# Filename: CSC558F24Assn1HandoutGen.py adapted from a previous CSC223.
# ************************************************************
# Author:     Dr. Parson
# Student coauthor: 
# Major:      CS&IT professor
# Creation Date: 11/20/2023 through 8/7/2024
# Course:     CSC223 Fall 2023 and CSC 558 / 523 Fall 2024.
# Professor Name: D. Parson
# Input: First command line arg is a pseudo-random number seed, mandatory.
# Input: 2nd, mandatory command line arg for output CSV file name.
# Output: Named file per Input with an 14-column CSV output file, heading:
#
#   Distribution, Param1, Param2, Count, Mean, Median, Mode, Pstdev, Min, Max
#
#   Count is an int and Mean, Median, Mode, Pstdev must be rounded
#       to 6 places.
#
#   Distribution is one of 'uniform', 'normal', or 'exponential'.
#   Param1 is Low for uniform, Loc (mean) for normal,
#       scale (halfway point in distribution) for exponential.
#   Param2 is high for uniform, scale (standard deviation) for normal,
#       None (not used) for exponential.
#   Count is size for uniform, normal, and exponential.
#   Mean is stats.mean for the generated data in that distribution.
#       (sum of values / number of values).
#   HMean is the harmonic mean, reciprocal of the arithmetic mean of
#       the reciprocals of the data.
#   Median is stats.median for the generated data in that distribution.
#       (central value in the distribution)
#   Pstdev is stats.pstdev, population standard deviation.
#       help(stats.pstdev)
#       Help on function pstdev in module statistics:
#       pstdev(data, mu=None)
#           Return the square root of the population variance.
#           See ``pvariance`` for arguments and other details.
#   PVariance is the population variance of the data.
#   Min and Max are the built-in Python min() and max() results.
# https://numpy.org/doc/stable/reference/random/generator.html
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.uniform.html
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.normal.html
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html
# ************************************************************

import sys          # Used for argv command line arguments
import statistics as stats # (mean, median, mode, pstdev).
# ^^^ multimode not available in earlier libraries, mode throws a
# StatisticsError on multiple modes; pstdev is *population* standard deviation.
#   min() and max() are builtins
import numpy as np  # numpy.random.default_rng(seed=?) for distributions.
import csv

# DR. PARSON SUPPLIES makeUniform closure to bind generator parameters.
# STUDENTS must do the same for makeNormal and makeExponential.
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.uniform.html
def makeUniform(Low, high, size, seed):
    mygenerator =  np.random.default_rng(seed=seed)
    def returnUniform():
        return(('uniform', Low, high, size,
            mygenerator.uniform(Low, high, size)))
        # returns a 5-tuple with the name of the distribution
        # ('uniform', 'normal', or 'exponential'), the Param1, Param2,
        # and Count of the distribution, and a Count-size
        # np.ndarray of float values.
    return returnUniform

# STUDENT 1: 20% Write makeNormal(Loc, scale, size, seed)
#   similar to makeUniform, that returns a closure-function
#   that returns a 5-tuple per the specification comments at the
#   top of this file and the example of makeUniform.
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.normal.html
def makeNormal(Loc, scale, size, seed):
    pass    # Your code goes here.
    # PARSON:
    mygenerator =  np.random.default_rng(seed=seed)
    def returnNormal():
        return(('normal', Loc, scale, size,
            mygenerator.normal(Loc, scale, size)))
    return returnNormal

def makeBimodal(Loc, scale, size, seed):
    mygenerator =  np.random.default_rng(seed=seed)
    def returnNormal():
        lefthalf = list(mygenerator.normal(Loc-2*scale, scale, size))
        righthalf = list(mygenerator.normal(Loc+2*scale, scale, size))
        return(('bimodal', Loc, scale, size, (lefthalf+righthalf)))
    return returnNormal

# STUDENT 2: 20% Write makeExponential(scale, size, seed)
#   similar to makeUniform, that returns a closure-function
#   that returns a 5-tuple per the specification comments at the
#   top of this file and the example of makeUniform. Field [2] in
#   the returned 5-tuple is None becausr Param2 is not used by exponential.
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html
def makeExponential(scale, size, seed):
    pass    # Your code goes here.
    # PARSON:
    mygenerator =  np.random.default_rng(seed=seed)
    def returnExponential():
        return(('exponential', scale, 0, size,  # zero is a fake Param2
            mygenerator.exponential(scale, size)))
    return returnExponential

def makeReverseExponential(scale, size, seed):
    pass    # Your code goes here.
    # PARSON:
    mygenerator =  np.random.default_rng(seed=seed)
    def returnExponential():
        original = mygenerator.exponential(scale, size)
        maxv = max(original)
        # replacement = [abs(v-maxv) for v in original]
        replacement = [(maxv-v+1) for v in original]
        return('revexponential', scale, 0, size, replacement)
        # zero is a fake Param2 ^^^^^
    return returnExponential

# def makeBimodal(scale, size, seed):
#     # This uses exponential and revexponential.
#     mygenerator =  np.random.default_rng(seed=seed)
#     def returnBimodal():
#         lefthalf = list(mygenerator.exponential(scale, size//2))
#         original = mygenerator.exponential(scale, size//2)
#         maxv = max(original)
#         replacement = [(maxv-v+1) for v in original]
#         lefthalf.extend(replacement)
#         return('bimodal', scale, None, size, lefthalf)
#     return returnBimodal

# STUDENT 3: 20% Write generator generateDistribution() with these parameters
# that YIELDS the return value from calling distributionClosure with no
# arguments, doing that howManyTimesToYield times in a loop,
# then YIELDS None as sentinel value, then falls out the bottom
# (default return with no "return" statement is also None).
# See genASCII in DataflowGeneratorsOnePath.py for an example
# of yielding values.
# The yield of sentinel value None at the end is new for this assignment.
def generateDistribution(distributionClosure, howManyTimesToYield):
    for i in range(0, howManyTimesToYield):
        yield distributionClosure()
    yield None

# STUDENT 4: 20% Write generator generateStatisticalAnalysis that iterates
# over its predecessor generator until the predecessor YIELDS None as a
# sentinel value. Otherwise, create and YIELD a 10-tuple that consists
# of the first 4 fields of the incoming 5-tuple from the predecessor
# (Distribution, Param1, Param2, Count, ndarray), followed by the mean,
# median, mode, pstdev, min, & max of the incoming ndarray,
#   called from the stats module (min & max are builtins). YIELDS the 10-tuple:
# (Distribution, Param1, Param2, Count, Mean, Median, Mode, Pstdev, Min, Max)
# To compute mode do this:
#       try:
#           mode = round(stats.mode(tuple5[4]),6)
#       except stats.StatisticsError:
#           mode = None
# because stats.mode() raises StatisticsError when there is no unique mode.
# When predecessor YIELDS a value of None, break out of your loop,
# YIELD None (as a sentinel value), and fall out the bottom.
# ROUND each of the Mean through Max values to 0 places, e.g. round(value,0)
# We are doing that to try to get a mode value.
# See genASCII2Count in DataflowGeneratorsOnePath.py for an example
# of iterating over a predecessor generator and yielding values.
# The yield of sentinel value None at the end is new for this assignment.
outfileset = set([])    # set of distributions already written into CSV files
def generateStatisticalAnalysis(predecessor):
    global outfileset
    # Distribution, Param1, Param2, Count, Mean, Median, Mode, Pstdev, Min, Max
    # Distribution, Param1, Param2, Count come in from predecessor.
    # See DataflowGeneratorsOnePath.py's genASCII2Count
    # UPDATED JULY 2024 to mode and add hmean and pvariance and p25, p50, p75.
    # Distribution, Param1, Param2, Count, Mean, Hmean, Median, Pstdev,
    #   Pvariance, P25, P50, P75, Min, Max
    for tuple5 in predecessor:
        if tuple5 == None:
            break
        distname = tuple5[0]
        # distribution = [int(round(e,0)) for e in tuple5[4]]
        distribution = list(tuple5[4])  # numpy arrays' '+' adds elements
        mind = min(distribution)
        rmind = int(round(mind,0))  # a value > 0 such as 0.4 rounds to 0
        if rmind <= 0:       # harmonic mean cannot use these values:
            offset = abs(rmind) + 1
            d = [(element + offset) for element in distribution]
            distribution = d
        mind = min(distribution)
        maxd = max(distribution)
        ranged = float(maxd - mind)
        # print('mind maxd ranged of', distname, mind, maxd, ranged)
        # Now normalize all distributions into the range [1, 100] July 25, 2024:
        normd = []
        for d in distribution:
            nd = int(round(((d - mind) / ranged) * 100 + 1,0))
            if nd < 1:
                nd = 1
            elif nd > 100:
                nd = 100
            # print('nd for d', nd, d) ; sys.exit(0)
            normd.append(nd)
        distribution = normd
        if not distname in outfileset:
            of = open(distname + '.csv', 'w', newline='')
            ocsv = csv.writer(of)
            ocsv.writerow([distname])
            for v in distribution:
                ocsv.writerow([v])
            of.close()
            outfileset.add(distname)
        mean = int(round(stats.mean(distribution),0))
        hmean = int(round(stats.harmonic_mean(distribution),0))
        median = int(round(stats.median(distribution),0))
        pstdev = int(round(stats.pstdev(distribution),0))
        pvariance = int(round(stats.pvariance(distribution),0))
        minny = int(round(min(distribution),0))
        maxxy = int(round(max(distribution),0))
        percentiles = np.percentile(distribution, (25, 50, 75))
        result = (tuple5[0], tuple5[1], tuple5[2], tuple5[3],
            mean, hmean, median, pstdev, pvariance,
            int(round(percentiles[0],0)),
            int(round(percentiles[1],0)),
            int(round(percentiles[2],0)),
            minny, maxxy)
        yield result
    yield None

# STUDENT 5: 20% Write saveStatisticalAnalysisCSV that tests
# if writeHeader, and if "if writeHeader" succeeds, calls csvWriter.writerow
# with a CSV header row containing the following strings
# ['Distribution', 'Param1', 'Param2', 'Count', 'Mean', 'Median', 'Mode',
#   'Pstdev', 'Min', 'Max']
# Then, iterate over its predecessor and writerow the incoming 10-tuple
#   to csvWriter and then YIELD that 10-tuple. HOWEVER, when the incoming
#   value from predecessor is None, YIELD None (as a sentinel value),
#   and fall out the bottom.
# See sinkOutput2File in DataflowGeneratorsOnePath.py for an example
# of iterating over a predecessor generator and writing and yielding values.
# The yield of sentinel value None at the end is new for this assignment.
def saveStatisticalAnalysisCSV(predecessor, csvWriter, writeHeader):
    if writeHeader:
        csvWriter.writerow(['Distribution', 'Param1', 'Param2', 'Count',
            'Mean', 'Hmean', 'Median', 'Pstdev', 'Pvariance',
            'P25', 'P50', 'P75', 'Min', 'Max'])
    for tuple10 in predecessor:
        if tuple10 == None:
            break
        csvWriter.writerow(tuple10)
        yield tuple10
    yield None

__usage__ =                 \
    'USAGE: python CSC223f23DataflowAssn4.py SEED OUTFILE.csv'
# Symbol names with __underline__ should be private to their context.
if __name__ == '__main__':      # Entry code outside of any function.
    if len(sys.argv) != 3:
        # argv[0] is CSC223f23DataflowAssn4.py
        raise ValueError(__usage__)
        # https://docs.python.org/3.7/library/exceptions.html
    try:
        seed = int(sys.argv[1])
    except ValueError as badint:
        raise ValueError('Invalid, non-integer SEED on command line: '
            + sys.argv[1] + '\n' + __usage__)
    outcsvname = sys.argv[2]
    if not outcsvname.endswith('.csv'):
        raise ValueError('ERROR, OUTFILE must end in ".csv: "'
                + sys.argv[2] + '\n' + __usage__)
    outfile = open(outcsvname, 'w', newline='')
    outcsv = csv.writer(outfile)
    writeCSVheader = True
    howManyTimesToYield = 1000
    howManyTimesYielded = 0
    samplesPerDistribution = 10000
    for distributor in (makeUniform(0, 100, samplesPerDistribution, seed),
            makeNormal(50, 15, samplesPerDistribution, seed+1),
            makeBimodal(50, 15, samplesPerDistribution, seed+3),
            makeExponential(10, samplesPerDistribution, seed+17),
            makeReverseExponential(10, samplesPerDistribution, seed+19)):
        howManyTimesYielded += 1
        if howManyTimesYielded > howManyTimesToYield:
            raise RuntimeError('ERROR, Missing "yield None" as a '
                + 'sentinel value in the pipeline?')
        # Create and run 3 distinct PULL pipelines.
        stage1 = generateDistribution(distributor, howManyTimesToYield)
        stage2 = generateStatisticalAnalysis(stage1)
        stage3 = saveStatisticalAnalysisCSV(stage2, outcsv, writeCSVheader)
        writeCSVheader = False
        for generatedValue in stage3:
            if generatedValue == None:  # sentinel value, next distributor
                break   # out of inner for loop
    outfile.close()