# aligncsv.py, D. Parson, April 19, 2023
# Script to lengthen lines in a CSV file that are too short
# (missing comma-separated fields), or are too long (extra fields
# on the right side), that give an error when reading into Weka:
# "wrong number of values, Read N1, expected N2" error from Weka.
# See https://docs.python.org/3/library/csv.html

import sys
import os
import csv


def __alignrow__(therow, itslen):
    '''
    Return therow padded or truncated to exactly itslen fields.

    therow is never mutated. A short row yields a new list padded on the
    right with None (csv.writer emits None as an empty field); a long row
    yields a truncated copy. If len(therow) already == itslen, therow
    itself is returned, not a copy.
    '''
    missing = itslen - len(therow)
    if missing > 0:
        # Place unknown values in the new slots.
        return therow + [None] * missing
    if missing < 0:
        return therow[0:itslen]     # a truncated copy
    return therow


__usage__ = '''
python aligncsv.py infile.csv [-]outfile.csv first|last|min|max|N, where
    infile.csv is the input file with possible extra/missing fields,
    outfile.csv is the cleaned file to write, optional "-" forces deletion,
    first means take the length from the first row of data,
    last means take the length from the last row of data,
    min means take the length from the shortest row of data,
    max means take the length from the longest row of data,
    N means take the length of N attributes
'''


def align_rows(rows, constraint, final_length=-1):
    '''
    Return a new list holding every row of rows aligned to one length.

    rows is an iterable of lists and is not mutated. constraint selects
    the target length: 'first', 'last', 'min' or 'max' take it from the
    corresponding row of rows, while 'number' uses final_length, which
    must then be > 0. An empty rows yields an empty list.

    Raises ValueError for an unknown constraint, or for 'number' with a
    final_length < 1.
    '''
    rows = list(rows)   # Rows may be a one-shot iterator; we need two passes.
    if constraint == 'number':
        if final_length < 1:
            raise ValueError('ERROR, ' + str(final_length) + ' must be > 0.')
        target = final_length
    elif constraint not in ('first', 'last', 'min', 'max'):
        raise ValueError('INTERNAL ERROR, Invalid constraint: '
                         + str(constraint))
    elif not rows:
        return []       # No row to take a length from, nothing to align.
    elif constraint == 'first':
        target = len(rows[0])
    elif constraint == 'last':
        target = len(rows[-1])
    elif constraint == 'min':
        target = min(len(row) for row in rows)
    else:
        target = max(len(row) for row in rows)
    return [__alignrow__(row, target) for row in rows]


def main(argv):
    '''
    Run the command-line tool; argv has the layout of sys.argv.

    Every argument is validated before any file is opened, and the input
    is read completely before the output file is created or truncated, so
    a bad argument or an unreadable input never clobbers the output.

    Raises ValueError for a wrong argument count, a length argument that
    is neither a keyword nor an integer > 0, or an existing output file
    that was not prefixed with "-".
    '''
    if len(argv) != 4:  # The Python script name is argv[0].
        raise ValueError(__usage__)
    inname, outarg, lengtharg = argv[1:]

    if lengtharg in ('first', 'last', 'min', 'max'):
        constraint = lengtharg
        final_length = -1   # Unused for keyword constraints.
    else:
        final_length = int(lengtharg)   # ValueError if not an integer.
        constraint = 'number'
        if final_length < 1:
            raise ValueError('ERROR, ' + lengtharg + ' must be > 0.')

    if outarg.startswith('-') and len(outarg) > 1:
        outname = outarg[1:]    # Leading "-" permits overwriting.
    else:
        if os.path.exists(outarg):
            raise ValueError('ERROR, ' + outarg + ' exists, please remove.')
        outname = outarg

    # The csv module requires newline='' so that it, not the file object,
    # handles line endings (embedded newlines on input, no doubled '\r'
    # on output).
    with open(inname, 'r', newline='') as infile:
        inrows = list(csv.reader(infile))
    outrows = align_rows(inrows, constraint, final_length)
    with open(outname, 'w', newline='') as outfile:
        outcsv = csv.writer(outfile, delimiter=',', quotechar='"')
        outcsv.writerows(outrows)


if __name__ == '__main__':  # Invoked from command-line, not via import.
    main(sys.argv)