# Update 3/1/2023 It appears to be pulling in low freq pulse noise
# below 100 Hz for PulseOsc, so start looking above that point.
# Script extractAudioFreqARFF.py based on exploratory work done in
# extractFrequencySeriesData.py. D. Parson June 2021. This script uses a
# Python WAV audio file library to read a series of .wav files and,
# one at a time, extracts normalized frequency-domain attributes from them,
# along with a waveform type tag extracted from a file name like below:
#       SinOsc                Sin waveform
#       TriOsc                Triangle waveform
#       SqrOsc                Square waveform
#       SawOsc                Sawtooth waveform
#       PulseOsc              Pulse waveform
# /Users/parson/csc558_spring2020/assn1/genwaves/lazy2020data/waves/lazy1_SqrOsc_993_0.574001525357_0.189810423535_659894.wav
# It normalizes the first Nharmonics as ratios of amplitude relative to the
# first harmonic of amplitude 1.0, and the frequency of the harmonic as
# a multiple of the fundamental 1.0, and makes the Nharmonics into Weka
# attributes for classification of the waveform type. Note that non-harmonic
# signals above the fundamental with adequate amplitude may be in noisy data.
# See http://faculty.kutztown.edu/parson/spring2020/CSC558Audio1_2020.html
#   for docs on audio data.
# See https://realpython.com/python-scipy-fft/#using-the-fast-fourier-transform-fft
# See https://docs.scipy.org/doc/scipy/reference/tutorial/fft.html
# See https://docs.scipy.org/doc/scipy/reference/generated/scipy.fft.fft.html
#
# N.P.: This version extractAudioFreqARFF23Oct2021.py adds the following
# attributes from the wav file name that you can ignore (for csc558):
#   tfreq = float(fbase.split('_')[2].strip())
#   toscgn = float(fbase.split('_')[3].strip())
#   tnoign = float(fbase.split('_')[4].strip())
#   tid = int(fbase.split('_')[5].strip())

# The main add for CSC 558 fall 2021 is to generate additional ARFF files
# with the prefix "csc558FFT" that does NOT create amplitude and frequency
# multiples of the fundamental frequency in ampl1, freq1 ... ampl32, freq32.
# Instead it writes an ARFF file that just contains NFFTbins bins from
# the initial FFT analysis without sorting. This is because the original
# sorted version emphasizes FFT peaks, but the tnoign attribute of assn2
# shows up more prominently in the FFT troughs. N.P., this may be useful
# to retain in your eclipsing binary star ARFF file. I don't know.
# This version hard codes names csc558fa2021AudioHarmonicData.arff and
# csc558FFTfa2021AudioHarmonicData.arff since all other output file names
# are hard coded. WAV module still must be on command line.

from scipy.io import wavfile
import scipy.io
# from numpy import fft
from scipy.fft import fft, fftfreq, rfft
import numpy as np
import csv
import sys
import os
from pprint import pprint

Nharmonics = 32             # How many strongest bins to use from fft freqs.
# NFFTharmonics = 96          # How many strongest bins to use without sort.
NFFTharmonics = 128          # How many strongest bins to use without sort.
# DEBUG LEN OF unsortedFFT: 22050 NFFTharmonics for sampling rate 44100
NRndTrainLimit = 5          # How many of each in NRndTrain.arff, tid != 0
NRndTrainCount = {
    'SinOsc' : 0 ,
    'TriOsc' : 0 ,
    'SqrOsc' : 0 ,
    'SawOsc' : 0 ,
    'PulseOsc' : 0
}

def wav2arff(fpath, openarffcsv, noiselesscsv, noiseycsv, rndcsv,
    fftfilecsv, fftnoiselesscsv, fftnoiseycsv, fftrndarffcsv):
    '''
    Normalize the first Nharmonics of the WAV file in parameter *fpath*
    as ratios of amplitude relative to the first harmonic of amplitude 1.0,
    and the frequency of the harmonic as a multiple of the fundamental 1.0,
    and makes the Nharmonics into Weka attributes for classification of the
    waveform type. Note that non-harmonic signals above the fundamental with
    adequate amplitude may be in noisy data. This function writes one row
    of data to the open CSV file *openarffcsv* via writeRow().
    Base string for fpath must embed type tag as the second underline-separated
    field like this:
        lazy1_SqrOsc_993_0.574001525357_0.189810423535_659894.wav
    10/23/2021 N.P. add fftfilecsv, fftnoiselesscsv, fftnoiseycsv,
        fftrndarffcsv files for unsorted FFT.
    Return value is None.
    '''
    if '/' in fpath:
        ri = fpath.rindex('/')
        fbase = fpath[ri+1:]
    elif '\\' in fpath:
        fixpath = fpath.replace("\\","/")
        # print("DEBUG fixpath",fixpath)
        # ri = fpath.rindex('\\')
        ri = fixpath.rindex('/')
        # print("DEBUG FOUND BACKSLASH",ri)
        fbase = fixpath[ri+1:]
    else:
        # print("DEBUG NOT HERE")
        fbase = fpath
    # fbase = fpath.split('/')[-1].replace('.wav', '')
    fbase = fbase.replace('.wav', '')
    # print("DEBUG fbase", fbase)
    toosc = fbase.split('_')[1].strip()         # The oscillator class.
    tfreq = float(fbase.split('_')[2].strip())
    toscgn = float(fbase.split('_')[3].strip())
    tnoign = float(fbase.split('_')[4].strip())
    tid = int(fbase.split('_')[5].strip())
    # print("DEBUG 1", fpath, fbase, toosc)
    samplerate, data = wavfile.read(fpath)
    # normdata = []
    # for ix in range(0, len(data)):
        # normdata.append(data[ix]/32768.0)
        # normalize to [-1.0,1.0] for 16 bit samples
    normdata = list(data)
    complexfft = fft(normdata)
    absfft = np.abs(complexfft)
    maxsig = (-1, -1000000)
    if tid == 234361:
        for index in range(100, len(absfft)//2):
            print('tid 234361 index', index, absfft[index])
            if absfft[index] > maxsig[1]:
                maxsig = (index, absfft[index])
        print('tid 234361 PEAK at', maxsig)

    # 6/4/2021, 6/6/2021 FFT ANALYSIS:
    # discard mirror image right of center, sort on amplitude.
    freqstep = 1    # needed to find fundamental frequency and the harmonics
    sortedfft = []
    unsortedFFT = []            # N.P.
    # sort is pulling in low-frequency pulse noise below 100 Hz,
    # or possibly low-freq white noise for sine waves, so cut those out:
    nyquist = samplerate / 2.0      # 3/1/2023
    perbin = nyquist / int(len(absfft)/2) # 3/1/2023
    numbinsBelow100 = int(100 / perbin) # 3/1/2023
    # print("DEBUG numbinsBelow100 ", numbinsBelow100)
    for ix in range(0, int(len(absfft)/2)):
        if ix >= numbinsBelow100:   # 3/1/2023
            sortedfft.append([absfft[ix], freqstep]) 
        unsortedFFT.append(absfft[ix])          # N.P.
        freqstep += 1
    sortedfft.sort(reverse=True, key=lambda inst : inst[0])
    # sort on frequencyAmplitude
    fundamentalAmplitude = sortedfft[0][0] * 1.0
    fundamentalFrequency = sortedfft[0][1] * 1.0
    wekaDataRow = []
    for inst in sortedfft[0:Nharmonics]:
        # round to nearest multiple of the fundamental
        # Because of aliasing we need to keep some fractional digits.
        wekaDataRow.extend([round(inst[0]/fundamentalAmplitude, 6),
            round(inst[1]/fundamentalFrequency, 6)])
    wekaDataRow.extend([toosc, tfreq, toscgn, tnoign, tid])
    # N.P. start
    FFTSUM = 0.0        # Average all measured levels for white noise.
    wekafftRow = []
    # print("DEBUG LEN OF unsortedFFT:", len(unsortedFFT))
    # DEBUG LEN OF unsortedFFT: 22050 NFFTharmonics for sampling rate 44100
    for value in unsortedFFT[0:NFFTharmonics]:
        v = float(value)
        FFTSUM += v
        wekafftRow.append(round(v, 6))
    for value in unsortedFFT[NFFTharmonics:]:
        v = float(value)
        FFTSUM += v
    AVGFFT = FFTSUM / len(unsortedFFT)
    wekafftRow.extend([AVGFFT, toosc, tfreq, toscgn, tnoign, tid])
    fftfilecsv.writerow(wekafftRow)
    # N.P. also additional wekafftRow writes below
    openarffcsv.writerow(wekaDataRow)
    if tid == 0:
        noiselesscsv.writerow(wekaDataRow)
        fftnoiselesscsv.writerow(wekafftRow)
    if tid in [981018, 738502, 526474, 126978, 997716]:
        noiseycsv.writerow(wekaDataRow)
        fftnoiseycsv.writerow(wekafftRow)
    if tid != 0 and NRndTrainCount[toosc] < NRndTrainLimit:
        rndcsv.writerow(wekaDataRow)
        fftrndarffcsv.writerow(wekafftRow)
        NRndTrainCount[toosc] = NRndTrainCount[toosc] + 1
    return None

__USAGE__ = 'python extractAudioFreqARFF.py moduleWithWavPaths'
# Earlier revisions also took "outARFFname [ Nharmonics ]" on the command
# line; output file names and Nharmonics are now hard coded, so only the
# module name is accepted.


def _write_arff_header(of, relation, numeric_attributes):
    '''
    Write the ARFF preamble to the open text file *of*: the @relation line
    for *relation*, one numeric @attribute per name in *numeric_attributes*,
    the five tag attributes (toosc, tfreq, toscgn, tnoign, tid), and @data.
    The file is flushed so the header precedes any csv rows.
    '''
    of.write("@relation '" + relation + "'\n")
    for name in numeric_attributes:
        of.write("@attribute " + name + " numeric\n")
    of.write("@attribute toosc {PulseOsc, SawOsc, SinOsc, SqrOsc, TriOsc}\n")
    of.write("@attribute tfreq numeric\n")
    of.write("@attribute toscgn numeric\n")
    of.write("@attribute tnoign numeric\n")
    of.write("@attribute tid numeric\n")
    of.write("@data\n")
    of.flush()


if __name__ == '__main__':
    # Local imports: only the command-line driver needs these.
    import contextlib
    import importlib

    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: " + __USAGE__ + '\n')
        sys.exit(1)
    # The named module must define sequence WavPathsList. import_module
    # replaces exec() on a string built from the command line, so the
    # argument can only ever be treated as a module name.
    WavPathsList = importlib.import_module(sys.argv[1]).WavPathsList

    Nh = str(Nharmonics)
    Nfft = str(NFFTharmonics)       # N.P. add for unsorted FFT bin files

    harmonic_attributes = []
    for i in range(1, Nharmonics + 1):
        harmonic_attributes.extend(["ampl" + str(i), "freq" + str(i)])
    fft_attributes = ["amplfft" + str(i) for i in range(1, NFFTharmonics + 1)]
    fft_attributes.append("AVGFFT")

    # File order inside each group, and harmonic group before FFT group,
    # matches the order of wav2arff's csv-writer parameters.
    output_groups = [
        ('extractAudioFreqARFF_' + sys.argv[1], harmonic_attributes, [
            'csc558fa2021AudioHarmonicData_' + Nh + '.arff',
            'csc558lazytrain5fa2021_' + Nh + '.arff',
            'csc558lazynoise5fa2021_' + Nh + '.arff',
            'NRndTrain_' + Nh + '.arff',
        ]),
        ('extractAudioFreqFFTARFF_' + sys.argv[1], fft_attributes, [
            'csc558FFTfa2021AudioHarmonicData_' + Nfft + '.arff',
            'csc558FFTlazytrain5fa2021_' + Nfft + '.arff',
            'csc558FFTlazynoise5fa2021_' + Nfft + '.arff',
            'NRndFFTTrain_' + Nfft + '.arff',
        ]),
    ]

    # ExitStack closes (and so flushes) every output file even when a WAV
    # file fails part way through the run.
    with contextlib.ExitStack() as stack:
        csvwriters = []
        for relation, attributes, filenames in output_groups:
            for filename in filenames:
                of = stack.enter_context(open(filename, 'w'))
                _write_arff_header(of, relation, attributes)
                csvwriters.append(
                    csv.writer(of, delimiter=',', quotechar='"'))
        for path in WavPathsList:
            wav2arff(path, *csvwriters)