# Java original from https://www.openml.org/a/evaluation-measures/kappa
# Python port, Parson, 10/1/2019:
# The original code was uncommented; comments & examples added by Parson.
# Kappa compares this classifier's observed accuracy to this classifier's
# expected (chance) accuracy, not to ZeroR's expected accuracy.

import sys
import math

def kappa(confusionMatrix):     # 2D list of lists, each sublist is a row
    sumRows = [0.0 for i in range(0, len(confusionMatrix))]
    sumColumns = [0.0 for i in range(0, len(confusionMatrix))]
    sumOfWeights = 0.0
    for i in range(0, len(confusionMatrix)):
        for j in range(0, len(confusionMatrix)):
            sumRows[i] += confusionMatrix[i][j]
            sumColumns[j] += confusionMatrix[i][j]
            sumOfWeights += confusionMatrix[i][j]
    correct = 0.0
    chanceAgreement = 0.0
    DEBUGchanceAgreement = 0.0  # Added by Parson for figuring this out.
    for i in range(0, len(confusionMatrix)):
        # sumRows[i] is the number of instances that should have been
        # classified as class i; sumColumns[i] is the number that were
        # classified as i (maybe some wrongly).
        # Expanding the product sumRows[i] * sumColumns[i] includes the
        # squared correct diagonal term plus cross terms for swapped
        # classes, e.g.,
        # (in class a but classified as b) * (in class b but classified as a).
        chanceAgreement += (sumRows[i] * sumColumns[i])
        correct += confusionMatrix[i][i]    # predicted == correct on diagonal
        if sumRows[i] > DEBUGchanceAgreement:
            DEBUGchanceAgreement = sumRows[i]
    chanceAgreement /= (sumOfWeights * sumOfWeights)
    correct /= sumOfWeights
    DEBUGchanceAgreement /= sumOfWeights
    for row in confusionMatrix:
        for datum in row:
            sys.stdout.write(str(datum) + ",\t")
        sys.stdout.write('\n')
    print("chanceAgreement:", chanceAgreement, "ZeroR", DEBUGchanceAgreement)
    print("correct: ", correct)
    if (chanceAgreement < 1):
        # This is the actual formula:
        # Kappa = (observed accuracy - expected accuracy)
        #         / (1 - expected accuracy)
        return (correct - chanceAgreement) / (1 - chanceAgreement)
    else:
        return 1.0
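
# Optional cross-check of kappa() against scikit-learn -- a sketch added for
# illustration. It assumes scikit-learn is installed, which this module does
# not otherwise require, and the helper names below are ours, not Weka's or
# OpenML's. sklearn.metrics.cohen_kappa_score() takes label vectors rather
# than a confusion matrix, so _expand_labels() turns a matrix back into
# parallel (actual, predicted) label lists.
def _expand_labels(confusionMatrix):
    '''Expand a confusion matrix into parallel actual/predicted label lists.'''
    actual, predicted = [], []
    for i, row in enumerate(confusionMatrix):
        for j, count in enumerate(row):
            actual.extend([i] * count)      # row index = actual class
            predicted.extend([j] * count)   # column index = predicted class
    return actual, predicted

def _crosscheck_kappa(confusionMatrix):
    '''Compare kappa() above to sklearn.metrics.cohen_kappa_score, if available.'''
    try:
        from sklearn.metrics import cohen_kappa_score
    except ImportError:
        return None     # scikit-learn not installed; skip the cross-check.
    actual, predicted = _expand_labels(confusionMatrix)
    return cohen_kappa_score(actual, predicted)

# e.g., once the matrices below are defined, _crosscheck_kappa(J48_Q11_Fall2019_assn2)
# should agree with kappa(J48_Q11_Fall2019_assn2) to within floating-point rounding.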

# A ZeroR confusion matrix from a past project:
#     a    b    c    d    e    f    g    h    i    j    <-- classified as
#  1821    0    0    0    0    0    0    0    0    0 |  a = '(-inf-0.5]'
#   107    0    0    0    0    0    0    0    0    0 |  b = '(0.5-1.5]'
#    58    0    0    0    0    0    0    0    0    0 |  c = '(1.5-2.5]'
#    43    0    0    0    0    0    0    0    0    0 |  d = '(2.5-3.5]'
#    44    0    0    0    0    0    0    0    0    0 |  e = '(3.5-5.5]'
#    38    0    0    0    0    0    0    0    0    0 |  f = '(5.5-8.5]'
#    34    0    0    0    0    0    0    0    0    0 |  g = '(8.5-13.5]'
#    36    0    0    0    0    0    0    0    0    0 |  h = '(13.5-30.5]'
#    37    0    0    0    0    0    0    0    0    0 |  i = '(30.5-87.5]'
#    37    0    0    0    0    0    0    0    0    0 |  j = '(87.5-inf)'
ZeroR = [
    [1821, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [107,  0, 0, 0, 0, 0, 0, 0, 0, 0],
    [58,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [43,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [44,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [38,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [34,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [36,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [37,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [37,   0, 0, 0, 0, 0, 0, 0, 0, 0]
]

# A J48 confusion matrix from a past project:
#     a    b    c    d    e    f    g    h    i    j    <-- classified as
#  1753   21   15    4    5    2    1    6   10    4 |  a = '(-inf-0.5]'
#    74   17    4    1    2    0    4    2    0    3 |  b = '(0.5-1.5]'
#    39    8    4    1    4    2    0    0    0    0 |  c = '(1.5-2.5]'
#    24    5    3    0    3    3    1    1    2    1 |  d = '(2.5-3.5]'
#    30    2    2    1    3    0    3    1    2    0 |  e = '(3.5-5.5]'
#    19    5    3    1    0    2    2    3    3    0 |  f = '(5.5-8.5]'
#    15    5    0    3    3    4    1    1    1    1 |  g = '(8.5-13.5]'
#    18    2    0    1    1    3    3    3    2    3 |  h = '(13.5-30.5]'
#    14    2    1    1    3    1    0    5    6    4 |  i = '(30.5-87.5]'
#    18    3    0    1    1    0    0    6    4    4 |  j = '(87.5-inf)'
J48_Q11_Fall2019_assn2 = [
    [1753, 21, 15, 4, 5, 2, 1, 6, 10, 4],
    [74,   17,  4, 1, 2, 0, 4, 2,  0, 3],
    [39,    8,  4, 1, 4, 2, 0, 0,  0, 0],
    [24,    5,  3, 0, 3, 3, 1, 1,  2, 1],
    [30,    2,  2, 1, 3, 0, 3, 1,  2, 0],
    [19,    5,  3, 1, 0, 2, 2, 3,  3, 0],
    [15,    5,  0, 3, 3, 4, 1, 1,  1, 1],
    [18,    2,  0, 1, 1, 3, 3, 3,  2, 3],
    [14,    2,  1, 1, 3, 1, 0, 5,  6, 4],
    [18,    3,  0, 1, 1, 0, 0, 6,  4, 4]
]

OneR_Q9_Fall2019_assn2 = [
    [1806, 5, 0, 0, 3, 0, 0, 0, 0, 7],
    [106,  0, 0, 0, 0, 0, 0, 0, 0, 1],
    [57,   1, 0, 0, 0, 0, 0, 0, 0, 0],
    [42,   1, 0, 0, 0, 0, 0, 0, 0, 0],
    [44,   0, 0, 0, 0, 0, 0, 0, 0, 0],
    [34,   1, 0, 0, 0, 0, 0, 0, 0, 3],
    [33,   1, 0, 0, 0, 0, 0, 0, 0, 0],
    [33,   0, 0, 0, 0, 0, 0, 0, 0, 3],
    [30,   0, 0, 0, 0, 0, 0, 0, 0, 7],
    [28,   1, 0, 0, 0, 0, 0, 0, 0, 8],
]

Perfect = [
    [2,0,0,0,0],
    [0,2,0,0,0],
    [0,0,1,0,0],
    [0,0,0,2,0],
    [0,0,0,0,2]
]

Terrible = [
    [0,1,1,0,0],
    [1,0,0,1,0],
    [0,1,0,0,1],
    [1,0,0,0,0],
    [1,0,1,0,0]
]

TooMany_I1_TooFew_I1 = [
    [1,0,0,0,0],
    [1,1,0,1,0],
    [0,1,1,0,0],
    [0,0,0,1,0],
    [0,1,0,0,1]
]

TooMany_I1 = [
    [1,0,0,0,0],
    [1,1,1,1,1],
    [0,0,1,0,0],
    [0,0,0,1,0],
    [0,0,0,0,1]
]

TooFew_I1 = [
    [1,1,0,0,0],
    [0,1,0,0,0],
    [0,1,1,0,0],
    [0,1,0,1,0],
    [0,1,0,0,1]
]

Scattered = [
    [1,1,0,0,0],
    [0,1,0,1,0],
    [0,0,1,0,1],
    [0,0,0,1,0],
    [1,0,0,0,1]
]

# TEST RUN OUTPUT (apparently captured under Python 2, hence the tuple-style
# print lines; Python 3 prints the same values without the parentheses/quotes):
# $ python -i kappa.py
# >>> kappa(Perfect)
# 2,    0,      0,      0,      0,
# 0,    2,      0,      0,      0,
# 0,    0,      1,      0,      0,
# 0,    0,      0,      2,      0,
# 0,    0,      0,      0,      2,
# ('chanceAgreement:', 0.20987654320987653, 'ZeroR', 0.2222222222222222)
# ('correct: ', 1.0)
# 1.0
# >>> kappa(TooMany_I1)
# 1,    0,      0,      0,      0,
# 1,    1,      1,      1,      1,
# 0,    0,      1,      0,      0,
# 0,    0,      0,      1,      0,
# 0,    0,      0,      0,      1,
# ('chanceAgreement:', 0.16049382716049382, 'ZeroR', 0.5555555555555556)
# ('correct: ', 0.5555555555555556)
# 0.4705882352941177
# >>> kappa(TooFew_I1)
# 1,    1,      0,      0,      0,
# 0,    1,      0,      0,      0,
# 0,    1,      1,      0,      0,
# 0,    1,      0,      1,      0,
# 0,    1,      0,      0,      1,
# ('chanceAgreement:', 0.16049382716049382, 'ZeroR', 0.2222222222222222)
# ('correct: ', 0.5555555555555556)
# 0.4705882352941177
# >>> kappa(TooMany_I1_TooFew_I1)
# 1,    0,      0,      0,      0,
# 1,    1,      0,      1,      0,
# 0,    1,      1,      0,      0,
# 0,    0,      0,      1,      0,
# 0,    1,      0,      0,      1,
# ('chanceAgreement:', 0.20987654320987653, 'ZeroR', 0.3333333333333333)
# ('correct: ', 0.5555555555555556)
# 0.43750000000000006
# >>> kappa(Scattered)
# 1,    1,      0,      0,      0,
# 0,    1,      0,      1,      0,
# 0,    0,      1,      0,      1,
# 0,    0,      0,      1,      0,
# 1,    0,      0,      0,      1,
# ('chanceAgreement:', 0.19753086419753085, 'ZeroR', 0.2222222222222222)
# ('correct: ', 0.5555555555555556)
# 0.4461538461538462
# >>> kappa(Terrible)
# 0,    1,      1,      0,      0,
# 1,    0,      0,      1,      0,
# 0,    1,      0,      0,      1,
# 1,    0,      0,      0,      0,
# 1,    0,      1,      0,      0,
# ('chanceAgreement:', 0.20987654320987653, 'ZeroR', 0.2222222222222222)
# ('correct: ', 0.0)
# -0.265625
# >>> kappa(OneR_Q9_Fall2019_assn2)
# 1806, 5,      0,      0,      3,      0,      0,      0,      0,      7,
# 106,  0,      0,      0,      0,      0,      0,      0,      0,      1,
# 57,   1,      0,      0,      0,      0,      0,      0,      0,      0,
# 42,   1,      0,      0,      0,      0,      0,      0,      0,      0,
# 44,   0,      0,      0,      0,      0,      0,      0,      0,      0,
# 34,   1,      0,      0,      0,      0,      0,      0,      0,      3,
# 33,   1,      0,      0,      0,      0,      0,      0,      0,      0,
# 33,   0,      0,      0,      0,      0,      0,      0,      0,      3,
# 30,   0,      0,      0,      0,      0,      0,      0,      0,      7,
# 28,   1,      0,      0,      0,      0,      0,      0,      0,      8,
# ('chanceAgreement:', 0.7929455607396227, 'ZeroR', 0.8075388026607538)
# ('correct: ', 0.8044345898004435)
# 0.05548796298143078
# >>> kappa(J48_Q11_Fall2019_assn2)
# 1753, 21,     15,     4,      5,      2,      1,      6,      10,     4,
# 74,   17,     4,      1,      2,      0,      4,      2,      0,      3,
# 39,   8,      4,      1,      4,      2,      0,      0,      0,      0,
# 24,   5,      3,      0,      3,      3,      1,      1,      2,      1,
# 30,   2,      2,      1,      3,      0,      3,      1,      2,      0,
# 19,   5,      3,      1,      0,      2,      2,      3,      3,      0,
# 15,   5,      0,      3,      3,      4,      1,      1,      1,      1,
# 18,   2,      0,      1,      1,      3,      3,      3,      2,      3,
# 14,   2,      1,      1,      3,      1,      0,      5,      6,      4,
# 18,   3,      0,      1,      1,      0,      0,      6,      4,      4,
# ('chanceAgreement:', 0.7206151395519196, 'ZeroR', 0.8075388026607538)
# ('correct: ', 0.7951219512195122)
# 0.26668163603460054
#
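# A hand-worked check of the kappa formula on TooMany_I1, added for
# illustration (the helper name is ours); the numbers below come straight
# from the matrix and the code above and match the recorded test output:
#   row sums     = [1, 5, 1, 1, 1]
#   column sums  = [2, 1, 2, 2, 2]
#   sumOfWeights = 9
#   correct         = (1+1+1+1+1) / 9                       = 5/9   ~ 0.5556
#   chanceAgreement = (1*2 + 5*1 + 1*2 + 1*2 + 1*2) / 9**2  = 13/81 ~ 0.1605
#   kappa           = (5/9 - 13/81) / (1 - 13/81) = 32/68 = 8/17    ~ 0.4706
def _kappa_by_hand_TooMany_I1():
    '''Redo the TooMany_I1 arithmetic exactly, with fractions.'''
    from fractions import Fraction
    correct = Fraction(5, 9)            # observed accuracy
    chance = Fraction(13, 81)           # expected (chance) agreement
    return (correct - chance) / (1 - chance)    # Fraction(8, 17) ~ 0.4706
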
# Even though there are fewer correct instances for J48 (79.5%) than for
# OneR (80.4%) in the above two confusion matrices, J48's chanceAgreement of
# .721 is less than OneR's of .793, so kappa goes up from OneR's .055 to
# J48's .267.
#
# 1. THE ABOVE J48 ERRORS ARE MORE DISTRIBUTED AND THUS LESS CORRELATED
#    (less "expected") THAN FOR OneR'S MATRIX.
#
#    chanceAgreement += (sumRows[i] * sumColumns[i]) MULTIPLIES:
#    sumRows[i]:    instances actually in bin i (including those misclassified
#                   into other bins)
#    sumColumns[i]: instances classified as i (including those that belong in
#                   other bins)
#
#    Basically, it appears that Weka's kappa looks at the types of errors
#    made. Errors that are more consistent (correlated - they show a pattern,
#    as when OneR funnels nearly every prediction into the majority class)
#    produce a higher chanceAgreement than errors that are more scattered
#    (less correlated); thus J48 gets a much higher kappa because its
#    mistakes are less predictable, i.e., less attributable to chance.
#
# 2. ALSO, THE CORRECT PREDICTIONS ALONG THE DIAGONAL ARE MORE EVENLY
#    DISTRIBUTED FOR J48 THAN FOR OneR in the confusion matrices above, but
#    I don't think that is as important as observation #1.

def parsonErrors(confusionMatrix, iswrap=False):
    '''
    parsonErrors assumes the nominal bins in the square confusionMatrix
    parameter have a linear ordering from 0 through len(confusionMatrix)-1,
    as you would have for a Discretized numeric attribute. It returns the
    2-tuple (root mean-squared error, mean absolute error) in terms of the
    distance of each classified instance in the confusionMatrix from its
    correct classification. When iswrap==True, it assumes wrap-around at the
    top bin, i.e. bin len(confusionMatrix)-1 is adjacent to bin 0; when
    iswrap==False, it uses the simple linear ordering.
    '''
    def sqr(n):
        return n * n
    sumdiffs = 0.0
    sumsquares = 0.0
    sumweights = 0.0
    size = len(confusionMatrix)
    if (iswrap):
        for row in range(0, size):
            for col in range(0, size):
                # Circular distance: the shorter of the direct path and
                # the path that wraps around past the top bin.
                if (row <= col):
                    diff = min(col-row, row + size - col)
                else:
                    diff = min(row-col, col + size - row)
                sumdiffs += diff * confusionMatrix[row][col]
                sumsquares += sqr(diff) * confusionMatrix[row][col]
                sumweights += confusionMatrix[row][col]
    else:
        for row in range(0, size):
            for col in range(0, size):
                sumdiffs += abs(row-col) * confusionMatrix[row][col]
                sumsquares += sqr(row-col) * confusionMatrix[row][col]
                sumweights += confusionMatrix[row][col]
    RMSE = math.sqrt(sumsquares / sumweights)
    MAE = (sumdiffs / sumweights)
    return (RMSE, MAE)

# >>> parsonErrors(OneR_Q9_Fall2019_assn2)
# (2.1184439756691185, 0.774279379157428)
# >>> parsonErrors(J48_Q11_Fall2019_assn2)
# (1.9238265637655882, 0.7024390243902439)
# >>> parsonErrors(Perfect)
# (0.0, 0.0)
# >>> parsonErrors(TooMany_I1_TooFew_I1)
# (1.2909944487358056, 0.7777777777777778)
# >>> parsonErrors(TooMany_I1)
# (1.2909944487358056, 0.7777777777777778)
# >>> parsonErrors(TooFew_I1)
# (1.2909944487358056, 0.7777777777777778)
# >>> parsonErrors(Terrible)
# (2.211083193570267, 2.0)
# >>> parsonErrors(Scattered)
# (1.6666666666666667, 1.0)
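
# Added illustration of the iswrap option; the expected values below are
# hand-computed from the wrap rule in parsonErrors, not from a recorded run.
# In Scattered, the only error the wrap-around affects is the instance in
# bin 4 classified as bin 0: its distance drops from 4 (linear) to 1
# (circular, since bin 4 is adjacent to bin 0 when iswrap==True).
# >>> parsonErrors(Scattered, iswrap=True)
# Errors and circular distances: (0,1)->1, (1,3)->2, (2,4)->2, (4,0)->1
# over 9 weighted instances, so
#   MAE  = (1+2+2+1)/9       = 6/9        ~ 0.6667
#   RMSE = sqrt((1+4+4+1)/9) = sqrt(10/9) ~ 1.0541
# compared with (1.6666666666666667, 1.0) for the linear ordering shown above.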