# Script extractAudioFreqARFF.py based on exploratory work done in
# extractFrequencySeriesData.py. D. Parson June 2021. This script uses a
# Python WAV audio file library to read a series of .wav files and,
# one at a time, extracts normalized frequency-domain attributes from them,
# along with a waveform type tag extracted from a file name like below:
#       SinOsc                Sine waveform
#       TriOsc                Triangle waveform
#       SqrOsc                Square waveform
#       SawOsc                Sawtooth waveform
#       PulseOsc              Pulse waveform
# /Users/parson/csc558_spring2020/assn1/genwaves/lazy2020data/waves/lazy1_SqrOsc_993_0.574001525357_0.189810423535_659894.wav
# It normalizes the first Nharmonics as ratios of amplitude relative to the
# first harmonic of amplitude 1.0, and the frequency of the harmonic as
# a multiple of the fundamental 1.0, and makes the Nharmonics into Weka
# attributes for classification of the waveform type. Note that non-harmonic
# signals above the fundamental with adequate amplitude may be in noisy data.
# See http://faculty.kutztown.edu/parson/spring2020/CSC558Audio1_2020.html
#   for docs on audio data.
# See https://realpython.com/python-scipy-fft/#using-the-fast-fourier-transform-fft
# See https://docs.scipy.org/doc/scipy/reference/tutorial/fft.html
# See https://docs.scipy.org/doc/scipy/reference/generated/scipy.fft.fft.html

from scipy.io import wavfile
import scipy.io
# from numpy import fft
from scipy.fft import fft, fftfreq, rfft
import numpy as np
import csv
import sys
import os

# Number of strongest FFT bins kept per WAV file; each kept bin contributes
# one amplitude attribute and one frequency attribute to a data row.
Nharmonics = 32
# Per-waveform cap on the rows with tid != 0 that go into NRndTrain<N>.arff.
NRndTrainLimit = 5
# Running count, per waveform tag, of rows already written to that file.
NRndTrainCount = dict.fromkeys(
    ('SinOsc', 'TriOsc', 'SqrOsc', 'SawOsc', 'PulseOsc'), 0)

def wav2arff(fpath, openarffcsv, noiselesscsv, noiseycsv, rndcsv):
    '''
    Extract one row of normalized frequency-domain attributes from the WAV
    file *fpath* and write it to the supplied CSV writers.

    The Nharmonics strongest bins in the lower half of the FFT magnitude
    spectrum are emitted strongest-first as (amplitude, frequency) pairs.
    Amplitude is the ratio to the strongest bin's amplitude and frequency
    is the ratio to the strongest bin's frequency step, both rounded to 6
    decimal places, so the first pair is always (1.0, 1.0). Bin index ix is
    recorded as frequency step ix + 1, so bin 0 (the DC term) is step 1.
    Note that non-harmonic signals above the fundamental with adequate
    amplitude may be in noisy data. If the file has fewer than Nharmonics
    bins in its lower half, the row is correspondingly shorter.

    The base name of *fpath* must embed the metadata as underline-separated
    fields, with the waveform type tag second, like this:
        lazy1_SqrOsc_993_0.574001525357_0.189810423535_659894.wav
    Fields 2 through 6 are appended to the row as toosc (string), tfreq,
    toscgn, tnoign (floats) and tid (int).

    Parameters (each writer is an object with a csv.writer-style
    writerow() method):
        fpath         path of the WAV file to read.
        openarffcsv   receives every row.
        noiselesscsv  also receives the row when tid == 0.
        noiseycsv     also receives the row when tid is one of five
                      hard-coded ids.
        rndcsv        also receives the row when tid != 0, until
                      NRndTrainLimit rows have been written for this
                      waveform tag; NRndTrainCount is updated in place.

    Raises ValueError if the file is not single-channel or holds fewer
    than two samples, or if a numeric file name field does not parse.
    Raises IndexError if the file name has fewer than six fields.
    Return value is None.
    '''
    # Parse the metadata fields once from the file's base name. Only a
    # trailing '.wav' is removed, so the dots inside the numeric fields
    # are left alone.
    fbase = os.path.basename(fpath)
    if fbase.endswith('.wav'):
        fbase = fbase[:-len('.wav')]
    fields = [field.strip() for field in fbase.split('_')]
    toosc = fields[1]                           # The oscillator class.
    tfreq = float(fields[2])
    toscgn = float(fields[3])
    tnoign = float(fields[4])
    tid = int(fields[5])

    _, data = wavfile.read(fpath)               # sample rate is not needed
    samples = np.asarray(data)
    if samples.ndim != 1:
        # A 2-D array would be transformed per frame across its channels
        # rather than along time, so reject it with a clear message.
        raise ValueError("wav2arff needs single-channel audio, got shape "
            + str(samples.shape) + " from " + fpath)

    # Magnitude spectrum; discard the mirror image right of center.
    absfft = np.abs(fft(samples))
    lowerhalf = absfft[:len(absfft) // 2]
    if len(lowerhalf) == 0:
        raise ValueError("wav2arff needs at least 2 samples: " + fpath)

    # Strongest bins first. A stable sort on the negated amplitudes keeps
    # equal-amplitude bins in ascending bin order.
    strongest = np.argsort(-lowerhalf, kind='stable')[:Nharmonics]
    fundamentalAmplitude = lowerhalf[strongest[0]] * 1.0
    fundamentalFrequency = (int(strongest[0]) + 1) * 1.0
    wekaDataRow = []
    for ix in strongest:
        # Because of aliasing we need to keep some fractional digits.
        wekaDataRow.extend([
            round(lowerhalf[ix] / fundamentalAmplitude, 6),
            round((int(ix) + 1) / fundamentalFrequency, 6)])
    wekaDataRow.extend([toosc, tfreq, toscgn, tnoign, tid])

    # Used in csc558 fall 2021
    openarffcsv.writerow(wekaDataRow)
    if tid == 0:
        noiselesscsv.writerow(wekaDataRow)
    if tid in (981018, 738502, 526474, 126978, 997716):
        noiseycsv.writerow(wekaDataRow)
    if tid != 0 and NRndTrainCount[toosc] < NRndTrainLimit:
        rndcsv.writerow(wekaDataRow)
        NRndTrainCount[toosc] += 1
    return None

__USAGE__ =                                                         \
'python extractAudioFreqARFF.py moduleWithWavPaths outARFFname [ Nharmonics ]'

if __name__ == '__main__':
    # Script entry point. Imports the sequence WavPathsList from the module
    # named by the first argument, then writes one ARFF data row per WAV
    # path to the file named by the second argument and to three further
    # ARFF files in the current directory. The optional third argument
    # overrides the module-level Nharmonics read by wav2arff().
    import importlib    # needed only by the script entry point

    if len(sys.argv) < 3 or len(sys.argv) > 4:
        sys.stderr.write("USAGE: " + __USAGE__ + '\n')
        sys.exit(1)
    # That module must define sequence WavPathsList.
    # import_module() treats the argument strictly as a module name, where
    # exec() on a string built from argv could run arbitrary statements.
    WavPathsList = importlib.import_module(sys.argv[1]).WavPathsList
    if os.path.exists(sys.argv[2]):
        sys.stderr.write("ERROR, " + sys.argv[2] + " ALREADY EXISTS.\n")
        sys.exit(1)
    if len(sys.argv) == 4:
        try:
            Nharmonics = int(sys.argv[3])
        except ValueError:
            sys.stderr.write("ERROR, Nharmonics must be an integer: "
                + sys.argv[3] + '\n')
            sys.exit(1)
        if Nharmonics < 8:
            sys.stderr.write("ERROR, Nharmonics must be at least 8: "
                + sys.argv[3] + '\n')
            sys.exit(1)
    Nh = str(Nharmonics)
    # newline='' is what the csv module asks for on files it writes to, and
    # the with statement closes all four files even if wav2arff() raises.
    # Only the first output is guarded by the existence check above; the
    # other three are overwritten.
    with open(sys.argv[2], 'w', newline='') as openarfffile,               \
            open('csc558lazytrain5fa2021' + Nh + '.arff', 'w',
                newline='') as noiseless,                                   \
            open('csc558lazynoise5fa2021' + Nh + '.arff', 'w',
                newline='') as noisey,                                      \
            open('NRndTrain' + Nh + '.arff', 'w', newline='') as rndarff:
        # Every output file gets the same ARFF header, whose attribute
        # order matches the row layout produced by wav2arff().
        for of in [openarfffile, noiseless, noisey, rndarff]:
            of.write("@relation 'extractAudioFreqARFF_"
                + sys.argv[1] + "'\n")
            for i in range(1, Nharmonics+1):
                of.write("@attribute ampl" + str(i) + " numeric\n")
                of.write("@attribute freq" + str(i) + " numeric\n")
            of.write("@attribute toosc {PulseOsc, SawOsc, SinOsc, SqrOsc, TriOsc}\n")
            of.write("@attribute tfreq numeric\n")
            of.write("@attribute toscgn numeric\n")
            of.write("@attribute tnoign numeric\n")
            of.write("@attribute tid numeric\n")
            of.write("@data\n")
        openarffcsv = csv.writer(openarfffile, delimiter=',', quotechar='"')
        noiselesscsv = csv.writer(noiseless, delimiter=',', quotechar='"')
        noiseycsv = csv.writer(noisey, delimiter=',', quotechar='"')
        rndcsv = csv.writer(rndarff, delimiter=',', quotechar='"')
        for path in WavPathsList:
            wav2arff(path, openarffcsv, noiselesscsv, noiseycsv, rndcsv)