# countHours.py demos a filter, GZIPPED and CSV file I/O.
# D. Parson Fall 2024
import sys      # sys.argv for command line, sys.exit(N), etc.
import csv      # read and write comma-separated value files
import gzip     # read and write compressed (gzipped) .gz files
import io       # in this demo used for file seeking
from pprint import pformat  # format output across lines & nested structures
# https://docs.python.org/3/library/sys.html
# https://docs.python.org/3/library/csv.html
# https://docs.python.org/3/library/gzip.html
# https://docs.python.org/3/library/io.html#io.IOBase.seek
# https://docs.python.org/3/library/pprint.html


def main(argv):
    '''
    Count how often each valid HOUR value (0..23) appears in a CSV file.

    argv is the command line in sys.argv form; argv[1] is the path of a
    .csv or .csv.gz file whose header row contains a column named HOUR.
    The data rows are scanned twice to demo two ways of iterating over a
    lazy filter object, and the per-hour counts are printed after each pass.

    Raises ValueError when the file extension is not .csv or .csv.gz,
    when the file has no header row, or when the header has no HOUR column.
    Exits with a usage message when no file path is given.
    '''
    if len(argv) < 2:
        sys.exit('USAGE: python countHours.py FILE.csv | FILE.csv.gz')
    path = argv[1]
    # newline='' lets the csv module do its own line-ending handling,
    # as the csv documentation recommends for files given to csv.reader.
    if path.endswith('.csv.gz'):
        f = gzip.open(path, mode='rt', newline='')  # read compressed text
    elif path.endswith('.csv'):
        f = open(path, 'r', newline='')             # read regular text file
    else:
        raise ValueError('INVALID FILE EXTENSION ' + path)

    # The with statement closes f even when an exception is raised below.
    with f:
        fcsv = csv.reader(f)
        # ^^^ wrap line-oriented text file to read comma-separated values
        hdr = next(fcsv, None)  # next(iterator) reads one row; None at EOF
        if hdr is None:
            raise ValueError('EMPTY FILE ' + path)
        hourcol = hdr.index('HOUR')     # which column has HOUR data?
        print('hourcol', hourcol)

        def filterFunc(row):    # filter out rows with hours outside 0..23
            '''
            Boolean-returning filter to return True if HOUR field in a row
            of data is in the range [0, 24), else returns False.
            '''
            try:    # cells come in as strings; int() rejects non-ints
                hour = int(row[hourcol].strip())
            except (IndexError, ValueError):
                # IndexError: truncated or blank row without an HOUR cell.
                # ValueError: the HOUR cell does not hold an integer.
                return False
            return 0 <= hour <= 23

        # dictionary mapping hour to the number of times it appears
        count = dict.fromkeys(range(24), 0)

        # NOTE: The object returned by filter(...) is a special-purpose
        # ITERATOR. It returns values one at a time without expanding the
        # entire output list of values ahead of time, known as
        # LAZY EVALUATION. The csv.reader fcsv also reads the open input
        # file's rows (lines) lazily, one row per request.
        # Functions like filterFunc are 1ST CLASS FUNCTIONS IN PYTHON:
        # they can be stored in variables, returned from functions, and
        # passed as function arguments, as filterFunc is passed to filter.
        filt = filter(filterFunc, fcsv)

        # ITERATION 1 uses next() to retrieve the next row in a loop.
        # next() raises StopIteration when filt is out of values.
        # That is normal; the except clause catches it to end the loop.
        try:
            while True:
                row = next(filt)
                count[int(row[hourcol].strip())] += 1
        except StopIteration:
            pass
        print('PASS 1 count\n', pformat(count), '\nPASS 1 valid rows=',
              sum(count.values()))

        f.seek(0, io.SEEK_SET)  # Seek back to the beginning of the file.
        # https://docs.python.org/3/library/io.html#io.IOBase.seek
        for key in count:       # Reset the counters to 0.
            count[key] = 0
        next(fcsv)              # fcsv just reads that header row again.
        filt = filter(filterFunc, fcsv)     # fresh lazy filter for pass 2

        # ITERATION 2 just iterates over filt, which iterates over fcsv
        # in turn; the for loop handles StopIteration itself.
        for row in filt:
            count[int(row[hourcol].strip())] += 1
        print('\nPASS 2 count\n', pformat(count), '\nPASS 2 valid rows=',
              sum(count.values()))


if __name__ == '__main__':
    # entry point if entered from cmd line, not imported as a module
    main(sys.argv)