CSC 523 - Scripting for Data Science, Fall 2023, Monday 6-8:50 PM in Old Main 158.
ipython session for introducing classification.
 

Use Firefox or another non-Chrome browser for these links; Chrome has problems rendering these pages.

In [32]: from sklearn.tree import DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
# Non-target attributes must be numeric; None seems to work for missing values, but strings do not.

In [33]: from arfflib_3_3 import kappa

In [34]: nontargettrain = [(value, value+2) for value in range(0,20)]

In [35]: targettrain = ['odd' if ((value & 1) == 1) else 'even'
    ...:     for value in [pair[0] for pair in nontargettrain]]

In [36]: nontargettest = [(value, value+2) for value in range(11,31)]

In [37]: targettest = ['odd' if ((value & 1) == 1) else 'even'
    ...:     for value in [pair[0] for pair in nontargettest]]

In [38]: from sklearn.metrics import confusion_matrix
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [39]: model1 = DecisionTreeClassifier(max_depth=5, random_state=220223523)

In [40]: model1.fit(nontargettrain, targettrain) # train the model
Out[40]: DecisionTreeClassifier(max_depth=5, random_state=220223523)

In [41]: p1 = model1.predict(nontargettest)

In [42]: p1
Out[42]:
array(['odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd',
       'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd',
       'odd', 'odd'], dtype='<U4')

In [43]: cm1 = confusion_matrix(targettest, p1)

In [44]: cm1
Out[44]:
array([[ 0, 10],
       [ 0, 10]], dtype=int64)

In [45]: # ^^^ First row is "should have been even", 2nd "should have been odd"

In [46]: # ^^^ First column is "was predicted even", 2nd "predicted odd"

In [47]: kappa(cm1) # kappa of 0.0 = no better than always guessing the biggest category; 1.0 is perfect
Out[47]: (0.0, 0.5, 10, 0.5, 10)

In [48]: # 0.0 is the kappa; the remaining fields are the correct fraction, correct count, incorrect fraction, and incorrect count

In [49]: model2 = DecisionTreeClassifier(random_state=220223523)

In [50]: model2.fit(nontargettrain, targettrain) # train the model
Out[50]: DecisionTreeClassifier(random_state=220223523)

In [51]: p2 = model2.predict(nontargettest)

In [52]: p2
Out[52]:
array(['odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd',
       'odd', 'odd'], dtype='<U4')

In [53]: cm2 = confusion_matrix(targettest, p2)

In [54]: cm2
Out[54]:
array([[ 4,  6],
       [ 0, 10]], dtype=int64)

In [55]: kappa(cm2)
Out[55]: (0.3999999999999999, 0.7, 14, 0.3, 6)

In [56]: targettrain
Out[56]:
['even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd']

In [57]: nontargettrain
Out[57]:
[(0, 2),
 (1, 3),
 (2, 4),
 (3, 5),
 (4, 6),
 (5, 7),
 (6, 8),
 (7, 9),
 (8, 10),
 (9, 11),
 (10, 12),
 (11, 13),
 (12, 14),
 (13, 15),
 (14, 16),
 (15, 17),
 (16, 18),
 (17, 19),
 (18, 20),
 (19, 21)]

In [58]: targettest
Out[58]:
['odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even',
 'odd',
 'even']

In [59]: nontargettest
Out[59]:
[(11, 13),
 (12, 14),
 (13, 15),
 (14, 16),
 (15, 17),
 (16, 18),
 (17, 19),
 (18, 20),
 (19, 21),
 (20, 22),
 (21, 23),
 (22, 24),
 (23, 25),
 (24, 26),
 (25, 27),
 (26, 28),
 (27, 29),
 (28, 30),
 (29, 31),
 (30, 32)]

In [60]: p2
Out[60]:
array(['odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd', 'odd',
       'odd', 'odd'], dtype='<U4')

# The model fell off the turnip truck after (19, 21) in nontargettest because
# (20, 22) ... (30, 32) are not in the training data, while
# (11, 13) ... (19, 21) are in the training data.
# Derive some perfectly predictive attributes ;-) , a.k.a. "cheat".

In [64]: nontargettrain = list(map(lambda pair : ((True,) if ((pair[0]&1) == 1)
    ...:     else (False,)) + pair, nontargettrain))

In [65]: nontargettrain
Out[65]:
[(False, 0, 2),
 (True, 1, 3),
 (False, 2, 4),
 (True, 3, 5),
 (False, 4, 6),
 (True, 5, 7),
 (False, 6, 8),
 (True, 7, 9),
 (False, 8, 10),
 (True, 9, 11),
 (False, 10, 12),
 (True, 11, 13),
 (False, 12, 14),
 (True, 13, 15),
 (False, 14, 16),
 (True, 15, 17),
 (False, 16, 18),
 (True, 17, 19),
 (False, 18, 20),
 (True, 19, 21)]

In [66]: nontargettest = list(map(lambda pair : ((True,) if ((pair[0]&1) == 1)
    ...:     else (False,)) + pair, nontargettest))

In [67]: nontargettest
Out[67]:
[(True, 11, 13),
 (False, 12, 14),
 (True, 13, 15),
 (False, 14, 16),
 (True, 15, 17),
 (False, 16, 18),
 (True, 17, 19),
 (False, 18, 20),
 (True, 19, 21),
 (False, 20, 22),
 (True, 21, 23),
 (False, 22, 24),
 (True, 23, 25),
 (False, 24, 26),
 (True, 25, 27),
 (False, 26, 28),
 (True, 27, 29),
 (False, 28, 30),
 (True, 29, 31),
 (False, 30, 32)]

In [68]: model2.fit(nontargettrain, targettrain)
Out[68]: DecisionTreeClassifier(random_state=220223523)

In [69]: p2 = model2.predict(nontargettest)

In [70]: p2
Out[70]:
array(['odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even'], dtype='<U4')

In [71]: cm2 = confusion_matrix(targettest, p2)

In [72]: cm2
Out[72]:
array([[10,  0],
       [ 0, 10]], dtype=int64)

In [73]: kappa(cm2)
Out[73]: (1.0, 1.0, 20, 0.0, 0)

In [74]: model1.fit(nontargettrain, targettrain) # train the model
Out[74]: DecisionTreeClassifier(max_depth=5, random_state=220223523)

In [75]: p1 = model1.predict(nontargettest)

In [76]: p1
Out[76]:
array(['odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even'], dtype='<U4')

In [77]: cm1 = confusion_matrix(targettest, p1)

In [78]: cm1
Out[78]:
array([[10,  0],
       [ 0, 10]], dtype=int64)

In [79]: kappa(cm1)
Out[79]: (1.0, 1.0, 20, 0.0, 0)

In [80]: from sklearn.dummy import DummyClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
# ZeroR in Weka, uses the statistical mode of target value distribution

In [81]: model1 = DummyClassifier()

In [82]: model1.fit(nontargettrain, targettrain) # train the model
Out[82]: DummyClassifier()

In [83]: p1 = model1.predict(nontargettest)

In [84]: p1
Out[84]:
array(['even', 'even', 'even', 'even', 'even', 'even', 'even', 'even',
       'even', 'even', 'even', 'even', 'even', 'even', 'even', 'even',
       'even', 'even', 'even', 'even'], dtype='<U4')

In [85]: cm1 = confusion_matrix(targettest, p1)

In [86]: cm1
Out[86]:
array([[10,  0],
       [10,  0]], dtype=int64)

In [87]: kappa(cm1)
Out[87]: (0.0, 0.5, 10, 0.5, 10)

In [88]: nontargettest = [triplet[1:] for triplet in nontargettest]

In [89]: nontargettest
Out[89]:
[(11, 13),
 (12, 14),
 (13, 15),
 (14, 16),
 (15, 17),
 (16, 18),
 (17, 19),
 (18, 20),
 (19, 21),
 (20, 22),
 (21, 23),
 (22, 24),
 (23, 25),
 (24, 26),
 (25, 27),
 (26, 28),
 (27, 29),
 (28, 30),
 (29, 31),
 (30, 32)]

In [90]: nontargettrain = [triplet[1:] for triplet in nontargettrain]

In [91]: nontargettrain
Out[91]:
[(0, 2),
 (1, 3),
 (2, 4),
 (3, 5),
 (4, 6),
 (5, 7),
 (6, 8),
 (7, 9),
 (8, 10),
 (9, 11),
 (10, 12),
 (11, 13),
 (12, 14),
 (13, 15),
 (14, 16),
 (15, 17),
 (16, 18),
 (17, 19),
 (18, 20),
 (19, 21)]

# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [103]: nontargettrain = list(map(lambda pair : ((pair[0] & 7),) + pair, nontargettrain))

In [104]: nontargettrain
Out[104]:
[(0, 0, 2),
 (1, 1, 3),
 (2, 2, 4),
 (3, 3, 5),
 (4, 4, 6),
 (5, 5, 7),
 (6, 6, 8),
 (7, 7, 9),
 (0, 8, 10),
 (1, 9, 11),
 (2, 10, 12),
 (3, 11, 13),
 (4, 12, 14),
 (5, 13, 15),
 (6, 14, 16),
 (7, 15, 17),
 (0, 16, 18),
 (1, 17, 19),
 (2, 18, 20),
 (3, 19, 21)]

In [105]: nontargettest = list(map(lambda pair : ((pair[0] & 7),) + pair, nontargettest))
# Just dump the upper bits — slightly less domain expertise than (value & 1).

In [106]: nontargettest
Out[106]:
[(3, 11, 13),
 (4, 12, 14),
 (5, 13, 15),
 (6, 14, 16),
 (7, 15, 17),
 (0, 16, 18),
 (1, 17, 19),
 (2, 18, 20),
 (3, 19, 21),
 (4, 20, 22),
 (5, 21, 23),
 (6, 22, 24),
 (7, 23, 25),
 (0, 24, 26),
 (1, 25, 27),
 (2, 26, 28),
 (3, 27, 29),
 (4, 28, 30),
 (5, 29, 31),
 (6, 30, 32)]

In [107]: model1 = DecisionTreeClassifier(max_depth=5, random_state=220223523)

In [108]: model1.fit(nontargettrain, targettrain) # train the model
Out[108]: DecisionTreeClassifier(max_depth=5, random_state=220223523)

In [109]: p1 = model1.predict(nontargettest)

In [110]: p1
Out[110]:
array(['odd', 'even', 'odd', 'odd', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd', 'odd', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'odd'], dtype='<U4')

In [111]: cm1 = confusion_matrix(targettest, p1)

In [112]: cm1
Out[112]:
array([[ 7,  3],
       [ 0, 10]], dtype=int64)

In [113]: kappa(cm1)
Out[113]: (0.7, 0.85, 17, 0.15, 3)

In [114]: model1 = DecisionTreeClassifier(random_state=220223523)

In [115]: model1.fit(nontargettrain, targettrain) # train the model
Out[115]: DecisionTreeClassifier(random_state=220223523)

In [116]: p1 = model1.predict(nontargettest)

In [117]: p1
Out[117]:
array(['odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even'], dtype='<U4')

In [118]: cm1 = confusion_matrix(targettest, p1)

In [119]: cm1
Out[119]:
array([[10,  0],
       [ 0, 10]], dtype=int64)

In [120]: kappa(cm1)
Out[120]: (1.0, 1.0, 20, 0.0, 0)

In [121]: model1.tree_
Out[121]: <sklearn.tree._tree.Tree at 0x26071c46d40>

In [122]: model1.tree_.node_count # count of tree nodes
Out[122]: 15