# countHours.py demos a filter, GZIPPED and CSV file I/O.
# D. Parson Fall 2024
import sys          # sys.argv for command line, sys.exit(N), etc.
import csv          # read and write comma-separated value files
import gzip         # read and write compressed (gzipped) .gz files
import io           # in this demo used for file seeking
from pprint import pformat  # format output across lines & nested structures
# https://docs.python.org/3/library/sys.html
# https://docs.python.org/3/library/csv.html
# https://docs.python.org/3/library/gzip.html
# https://docs.python.org/3/library/io.html#io.IOBase.seek
# https://docs.python.org/3/library/pprint.html

def parseHour(row, hourcol):
    '''
    Return the HOUR cell of one csv row as an int in the range [0, 24),
    or None when that cell is missing, is not an integer, or is out of range.

    row is a list of strings, one per cell, as produced by csv.reader.
    hourcol is the index of the HOUR column within row.
    '''
    try:    # cells come in as strings; int() raises ValueError on a non-int
        hour = int(row[hourcol].strip())
    except (ValueError, IndexError):
        # IndexError means an invalid (e.g., truncated) row with no HOUR cell.
        # Only these two exceptions are expected; anything else is a real bug
        # and is allowed to propagate instead of being silently swallowed.
        return None
    return hour if 0 <= hour <= 23 else None


if __name__ == '__main__':
    # entry point if entered from cmd line, not imported as a module
    if len(sys.argv) < 2:
        # sys.exit(string) prints the string to stderr and exits with status 1.
        sys.exit('USAGE: python ' + sys.argv[0] + ' FILE.csv | FILE.csv.gz')
    path = sys.argv[1]
    if path.endswith('.csv.gz'):
        opener = gzip.open      # read compressed text file
    elif path.endswith('.csv'):
        opener = open           # read regular text file
    else:
        raise ValueError('INVALID FILE EXTENSION ' + path)

    # opener holds a function in a variable (functions are 1st class, see
    # below). mode='rt' reads text; newline='' is what the csv module asks
    # for so that newlines embedded in quoted cells are parsed correctly.
    # The with statement closes the file even if an exception is raised,
    # recovering resources, e.g. memory.
    with opener(path, mode='rt', newline='') as f:
        fcsv = csv.reader(f)
        # ^^^ wrap line-oriented text file to read comma-separated values

        def filterFunc(row):    # filter out rows with hours outside 0..23
            '''
            Boolean-returning filter to return True if HOUR field in a row
            of data is in the range [0, 24), else returns False.
            Reads hourcol, which is assigned below before any row is tested.
            '''
            return parseHour(row, hourcol) is not None

        # next(iterator) is the built-in that calls iterator.__next__().
        # The default None is returned when the file has no rows at all.
        hdr = next(fcsv, None)
        if hdr is None:
            raise ValueError('EMPTY INPUT FILE ' + path)
        hourcol = hdr.index('HOUR')     # which column has HOUR data?
        print('hourcol', hourcol)

        # dictionary mapping hour to the number of times it appears
        count = {h: 0 for h in range(24)}

        # NOTE: The object returned by filter(...) is a special-purpose
        # ITERATOR. It returns values one at a time without expanding the
        # entire output list of values ahead of time, known as LAZY
        # EVALUATION. Code below shows two ways to iterate through the
        # filtered data. The next line constructs a filter that applies
        # filterFunc to the open csv.reader fcsv, which also uses LAZY
        # EVALUATION to loop through the open input file's rows (lines).
        filt = filter(filterFunc, fcsv) # filter object does lazy evaluation
        # Functions like filterFunc are 1ST CLASS FUNCTIONS IN PYTHON:
        # 1. First class functions can be stored in variables and returned
        #    from functions.
        # 2. Second class functions can be passed as function arguments.
        # 3. Third class functions can only be called (invoked).
        # All 2nd class functions are also 3rd class, and all 1st class
        #   functions are also 2nd class. First class functions can do it all.
        # filterFunc is being passed as an argument to filter, and the
        # returned filter object is stored in the variable filt.

        # ITERATION 1 uses next(), i.e. __next__(), to retrieve the next row
        # of data in a loop. StopIteration exception is thrown by __next__()
        #   when filt is out of values. That is normal. The except clause
        #   here catches it. pass does nothing.
        try:
            while True:
                row = next(filt)
                # Cannot fail: filterFunc already accepted this row.
                hour = int(row[hourcol].strip())
                count[hour] += 1
        except StopIteration:   # next() aborts the loop
            pass
        print('PASS 1 count\n', pformat(count), '\nPASS 1 valid rows=',
            sum(count.values()))

        f.seek(0, io.SEEK_SET)  # Seek back to the beginning of the open file.
                                # fcsv just reads that initial row again.
        # https://docs.python.org/3/library/io.html#io.IOBase.seek
        for key in count:       # Reset the counters to 0.
            count[key] = 0
        hdr = next(fcsv)        # Skip the header row again after rewinding.
        filt = filter(filterFunc, fcsv) # a used-up filter cannot be restarted

        # ITERATION 2 just iterates over filt which iterates over fcsv in turn.
        # The for loop calls __next__() and handles StopIteration for us.
        for row in filt:
            hour = int(row[hourcol].strip())
            count[hour] += 1
        print('\nPASS 2 count\n', pformat(count), '\nPASS 2 valid rows=',
            sum(count.values()))
    # Leaving the with block has closed f; no explicit f.close() is needed.