# CPSC223_PostFinal_ML.py last class demo, Fall2024, D. Parson
#
# Demonstrates fitting scikit-learn LinearRegression and
# DecisionTreeRegressor models to synthetic 1-D data, and comparing four
# goodness-of-fit measures (Pearson, Spearman, MAE, RMSE) on targets a
# straight line can and cannot fit.  Pauses between sections via input().
import csv
from math import isclose

from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.tree import DecisionTreeRegressor, export_text


def pcc(seq1, seq2):
    """Return Pearson's correlation coefficient of seq1 vs. seq2,
    rounded to 6 decimal places."""
    return round(float(pearsonr(seq1, seq2)[0]), 6)


def scc(seq1, seq2):
    """Return Spearman's rank correlation coefficient of seq1 vs. seq2,
    rounded to 6 decimal places."""
    return round(float(spearmanr(seq1, seq2)[0]), 6)


def almostEquals(seq1, seq2):
    """Return True when every element of seq1 is close (math.isclose,
    default tolerances) to the corresponding element of seq2.

    Unlike exact ==, this tolerates the tiny float rounding error that
    model predictions carry.  all() short-circuits on the first mismatch,
    which the original accumulate-into-a-boolean loop did not.
    """
    return all(isclose(seq1[ix], seq2[ix]) for ix in range(len(seq1)))


def printTree(model, name):
    """Print the text rendering of a fitted decision-tree model labeled
    with name, then pause for the user."""
    # Unused local 'treedepth' from the original removed.
    treestructure = export_text(model, feature_names=['linear100'],
                                show_weights=False, max_depth=1000)
    print('tree ', name)
    for line in treestructure.split('\n'):
        print(line)
    input('\nHit Enter to continue\n')


# (label, measure-function) pairs, hoisted out of seven inline copies.
MEASURES = (('pcc', pcc), ('scc', scc), ('mae', mae), ('rmse', rmse))


def reportMeasures(actual, target):
    """Print every measure in MEASURES applied to (actual, target)."""
    for name, measure in MEASURES:
        print(name, measure(actual, target))


lregress = LinearRegression()
linear100 = list(range(100))
linearp7 = [i + 7 for i in linear100]
# scikit-learn requires rows on non-target attributes to be in a list per row,
# where the target attribute is what we are trying to predict
trainingLinear100 = [[ele] for ele in linear100]

# --- Target 1: y = x + 7, a perfect linear fit ----------------------------
lregress.fit(trainingLinear100, linearp7)  # predicting linearp7
predicted = lregress.predict(trainingLinear100)
print('unmodeled measures for linear100, linearp7')
print(linear100[0:10])
print(linearp7[0:10])
reportMeasures(linear100, linearp7)
print('measures for trainingLinear100 predicted, linearp7')
reportMeasures(predicted, linearp7)
print('lregress.coef_', lregress.coef_)
input('\nHit Enter to continue\n')
# Observed: pcc 1.0, scc 1.0, mae 0.0, rmse 0.0

# Exact element-wise == is unreliable for floats: roughly half the
# predictions differ from the integer targets by rounding error.
print('predicted == linearp7', predicted == linearp7)
print('almostEquals(predicted, linearp7)', almostEquals(predicted, linearp7))
# Observed: True
print('predicted[0]', predicted[0])  # e.g. 6.999999999999986
print('linearp7[0]', linearp7[0])    # 7
input('\nHit Enter to continue\n')

# --- Target 2: y = 4x + 7, still a perfect linear fit ---------------------
linearX4p7 = [i * 4 + 7 for i in linear100]
print('lregress.fit(trainingLinear100, linearX4p7)',
      lregress.fit(trainingLinear100, linearX4p7))
print('lregress.coef_', lregress.coef_)  # array([4.])
predicted = lregress.predict(trainingLinear100)
print('unmodeled measures for linear100, linearX4p7')
print(linear100[0:10])
print(linearX4p7[0:10])
reportMeasures(linear100, linearX4p7)
print('measures for trainingLinear100 predicted, linearX4p7')
reportMeasures(predicted, linearX4p7)
# Observed: pcc 1.0, scc 1.0, mae 0.0, rmse 0.0
print('predicted[0]', predicted[0])      # e.g. 6.999999999999943
print('linearX4p7[0]', linearX4p7[0])    # 7
input('\nHit Enter to continue\n')

# --- Target 3: sign-alternating stairsteps a line cannot fit --------------
step4 = [linear100[ix] // 4 * 4 * (-1 if ((ix % 4) < 2) else 1)
         for ix in range(len(linear100))]
lregress.fit(trainingLinear100, step4)
print('lregress.coef_', lregress.coef_)  # e.g. array([0.05760576])
predicted = lregress.predict(trainingLinear100)
print('unmodeled measures for linear100, step4')
print(linear100[0:10])
print(step4[0:10])
reportMeasures(linear100, step4)
print('measures for trainingLinear100 predicted, step4')
reportMeasures(predicted, step4)
input('\nHit Enter to continue\n')
# Observed: pcc 0.029694, scc 0.02996, mae 48.055302, rmse 55.975306

# An unconstrained decision tree memorizes step4 exactly...
dtr = DecisionTreeRegressor()
dtr.fit(trainingLinear100, step4)
predicted = dtr.predict(trainingLinear100)
printTree(dtr, 'dtr')
print('measures for DecisionTreeRegressor, step4')
reportMeasures(predicted, step4)
input('\nHit Enter to continue\n')
# Observed: pcc 1.0, scc 1.0, mae 0.0, rmse 0.0

# ...while one limited to depth 6 underfits the 25 steps.
dtr2 = DecisionTreeRegressor(max_depth=6)
dtr2.fit(trainingLinear100, step4)
predicted = dtr2.predict(trainingLinear100)
printTree(dtr2, 'dtr2')
print('measures for DecisionTreeRegressor max_depth=6, step4')
reportMeasures(predicted, step4)
input('\nHit Enter to continue\n')
# Observed: pcc 0.569461, scc 0.564467, mae 36.96, rmse 46.033032

# Save a .csv of linear100, step4 for Weka M5P model tree analysis.
# Context manager closes the file even if a write raises (the original
# used a bare open()/close() pair).
with open('linear100_step4.csv', 'w', newline='') as f:
    fcsv = csv.writer(f, delimiter=',', quotechar='"')
    fcsv.writerow(['linear100', 'step4'])
    rows = [[linear100[ix], step4[ix]] for ix in range(len(linear100))]
    fcsv.writerows(rows)