CSC 523 - Scripting for Data Science, Fall 2023, Monday 6-8:50 PM in Old Main 158.
IPython session for introducing classification.
Use Firefox or another non-Chrome browser for these links; Chrome has problems with them.
In [32]: from sklearn.tree
import DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
# Non-target attributes must be numeric (not strings);
# None for missing values seems to work.
In [33]: from arfflib_3_3 import kappa
In [34]: nontargettrain = [(value,
value+2) for value in range(0,20)]
In [35]: targettrain = ['odd' if ((value
& 1) == 1) else 'even'
...: for value
in [pair[0] for pair in nontargettrain]]
In [36]: nontargettest = [(value,
value+2) for value in range(11,31)]
In [37]: targettest = ['odd' if ((value
& 1) == 1) else 'even'
...: for value
in [pair[0] for pair in nontargettest]]
In [38]: from sklearn.metrics import
confusion_matrix
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
In [39]: model1 =
DecisionTreeClassifier(max_depth=5,
random_state=220223523)
In [40]: model1.fit(nontargettrain,
targettrain) # train the model
Out[40]:
DecisionTreeClassifier(max_depth=5,
random_state=220223523)
In [41]: p1 =
model1.predict(nontargettest)
In [42]: p1
Out[42]:
array(['odd', 'odd', 'odd', 'odd',
'odd', 'odd', 'odd', 'odd', 'odd',
'odd', 'odd', 'odd', 'odd', 'odd',
'odd', 'odd', 'odd', 'odd',
'odd', 'odd'], dtype='<U4')
In [43]: cm1 =
confusion_matrix(targettest, p1)
In [44]: cm1
Out[44]:
array([[ 0, 10],
[
0, 10]], dtype=int64)
In [45]: # ^^^ First row is "should
have been even", 2nd "should have been
odd"
In [46]: # ^^^ First column is "was
predicted even", 2nd "predicted odd"
In [47]: kappa(cm1) # pick biggest
category == 0, best above that is 1.0
Out[47]: (0.0, 0.5, 10, 0.5, 10)
In [48]: # 0.0 is the kappa; the remaining fields are the fraction and
count of correctly classified instances, then the fraction and count of
incorrectly classified instances.
In [49]: model2 =
DecisionTreeClassifier(random_state=220223523)
In [50]: model2.fit(nontargettrain,
targettrain) # train the model
Out[50]:
DecisionTreeClassifier(random_state=220223523)
In [51]: p2 =
model2.predict(nontargettest)
In [52]: p2
Out[52]:
array(['odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even', 'odd',
'odd', 'odd', 'odd', 'odd', 'odd',
'odd', 'odd', 'odd', 'odd',
'odd', 'odd'], dtype='<U4')
In [53]: cm2 =
confusion_matrix(targettest, p2)
In [54]: cm2
Out[54]:
array([[ 4, 6],
[
0, 10]], dtype=int64)
In [55]: kappa(cm2)
Out[55]: (0.3999999999999999, 0.7, 14,
0.3, 6)
In [56]: targettrain
Out[56]:
['even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd']
In [57]: nontargettrain
Out[57]:
[(0, 2),
(1, 3),
(2, 4),
(3, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 9),
(8, 10),
(9, 11),
(10, 12),
(11, 13),
(12, 14),
(13, 15),
(14, 16),
(15, 17),
(16, 18),
(17, 19),
(18, 20),
(19, 21)]
In [58]: targettest
Out[58]:
['odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even',
'odd',
'even']
In [59]: nontargettest
Out[59]:
[(11, 13),
(12, 14),
(13, 15),
(14, 16),
(15, 17),
(16, 18),
(17, 19),
(18, 20),
(19, 21),
(20, 22),
(21, 23),
(22, 24),
(23, 25),
(24, 26),
(25, 27),
(26, 28),
(27, 29),
(28, 30),
(29, 31),
(30, 32)]
In [60]: p2
Out[60]:
array(['odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even', 'odd',
'odd', 'odd', 'odd', 'odd', 'odd',
'odd', 'odd', 'odd', 'odd',
'odd', 'odd'], dtype='<U4')
# It fell off the turnip truck
after (19, 21) in nontargettest
because
# (20, 22) ... (30, 32) are not
in the training data while
# (11, 13) ... (19, 21) are in
the training data.
# Derive some perfectly predictive attributes ;-), a.k.a. "cheat".
In [64]: nontargettrain =
list(map(lambda pair : ((True,) if
((pair[0]&1) == 1)
...: else
(False,)) + pair, nontargettrain))
In [65]: nontargettrain
Out[65]:
[(False, 0, 2),
(True, 1, 3),
(False, 2, 4),
(True, 3, 5),
(False, 4, 6),
(True, 5, 7),
(False, 6, 8),
(True, 7, 9),
(False, 8, 10),
(True, 9, 11),
(False, 10, 12),
(True, 11, 13),
(False, 12, 14),
(True, 13, 15),
(False, 14, 16),
(True, 15, 17),
(False, 16, 18),
(True, 17, 19),
(False, 18, 20),
(True, 19, 21)]
In [66]: nontargettest = list(map(lambda
pair : ((True,) if ((pair[0]&1) ==
1)
...: else
(False,)) + pair, nontargettest))
In [67]: nontargettest
Out[67]:
[(True, 11, 13),
(False, 12, 14),
(True, 13, 15),
(False, 14, 16),
(True, 15, 17),
(False, 16, 18),
(True, 17, 19),
(False, 18, 20),
(True, 19, 21),
(False, 20, 22),
(True, 21, 23),
(False, 22, 24),
(True, 23, 25),
(False, 24, 26),
(True, 25, 27),
(False, 26, 28),
(True, 27, 29),
(False, 28, 30),
(True, 29, 31),
(False, 30, 32)]
In [68]: model2.fit(nontargettrain,
targettrain)
Out[68]:
DecisionTreeClassifier(random_state=220223523)
In [69]: p2 =
model2.predict(nontargettest)
In [70]: p2
Out[70]:
array(['odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even', 'odd',
'even', 'odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even',
'odd', 'even'], dtype='<U4')
In [71]: cm2 =
confusion_matrix(targettest, p2)
In [72]: cm2
Out[72]:
array([[10, 0],
[
0, 10]], dtype=int64)
In [73]: kappa(cm2)
Out[73]: (1.0, 1.0, 20, 0.0, 0)
In [74]: model1.fit(nontargettrain,
targettrain) # train the model
Out[74]:
DecisionTreeClassifier(max_depth=5,
random_state=220223523)
In [75]: p1 =
model1.predict(nontargettest)
In [76]: p1
Out[76]:
array(['odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even', 'odd',
'even', 'odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even',
'odd', 'even'], dtype='<U4')
In [77]: cm1 =
confusion_matrix(targettest, p1)
In [78]: cm1
Out[78]:
array([[10, 0],
[
0, 10]], dtype=int64)
In [79]: kappa(cm1)
Out[79]: (1.0, 1.0, 20, 0.0, 0)
In [80]: from sklearn.dummy import
DummyClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
# ZeroR in Weka, uses the statistical
mode of target value distribution
In [81]: model1 = DummyClassifier()
In [82]: model1.fit(nontargettrain,
targettrain) # train the model
Out[82]: DummyClassifier()
In [83]: p1 =
model1.predict(nontargettest)
In [84]: p1
Out[84]:
array(['even', 'even', 'even', 'even',
'even', 'even', 'even', 'even',
'even', 'even', 'even', 'even', 'even',
'even', 'even', 'even',
'even', 'even', 'even', 'even'],
dtype='<U4')
In [85]: cm1 =
confusion_matrix(targettest, p1)
In [86]: cm1
Out[86]:
array([[10, 0],
[10, 0]], dtype=int64)
In [87]: kappa(cm1)
Out[87]: (0.0, 0.5, 10, 0.5, 10)
In [88]: nontargettest = [triplet[1:]
for triplet in nontargettest]
In [89]: nontargettest
Out[89]:
[(11, 13),
(12, 14),
(13, 15),
(14, 16),
(15, 17),
(16, 18),
(17, 19),
(18, 20),
(19, 21),
(20, 22),
(21, 23),
(22, 24),
(23, 25),
(24, 26),
(25, 27),
(26, 28),
(27, 29),
(28, 30),
(29, 31),
(30, 32)]
In [90]: nontargettrain = [triplet[1:]
for triplet in nontargettrain]
In [91]: nontargettrain
Out[91]:
[(0, 2),
(1, 3),
(2, 4),
(3, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 9),
(8, 10),
(9, 11),
(10, 12),
(11, 13),
(12, 14),
(13, 15),
(14, 16),
(15, 17),
(16, 18),
(17, 19),
(18, 20),
(19, 21)]
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
In [103]: nontargettrain =
list(map(lambda pair : ((pair[0] &
7),) + pair, nontargettrain))
In [104]: nontargettrain
Out[104]:
[(0, 0, 2),
(1, 1, 3),
(2, 2, 4),
(3, 3, 5),
(4, 4, 6),
(5, 5, 7),
(6, 6, 8),
(7, 7, 9),
(0, 8, 10),
(1, 9, 11),
(2, 10, 12),
(3, 11, 13),
(4, 12, 14),
(5, 13, 15),
(6, 14, 16),
(7, 15, 17),
(0, 16, 18),
(1, 17, 19),
(2, 18, 20),
(3, 19, 21)]
In [105]: nontargettest =
list(map(lambda pair : ((pair[0] &
7),) + pair, nontargettest))
# Just dump the upper bits -- slightly less domain expertise than (value & 1).
In [106]: nontargettest
Out[106]:
[(3, 11, 13),
(4, 12, 14),
(5, 13, 15),
(6, 14, 16),
(7, 15, 17),
(0, 16, 18),
(1, 17, 19),
(2, 18, 20),
(3, 19, 21),
(4, 20, 22),
(5, 21, 23),
(6, 22, 24),
(7, 23, 25),
(0, 24, 26),
(1, 25, 27),
(2, 26, 28),
(3, 27, 29),
(4, 28, 30),
(5, 29, 31),
(6, 30, 32)]
In [107]: model1 =
DecisionTreeClassifier(max_depth=5,
random_state=220223523)
In [108]: model1.fit(nontargettrain,
targettrain) # train the model
Out[108]:
DecisionTreeClassifier(max_depth=5,
random_state=220223523)
In [109]: p1 =
model1.predict(nontargettest)
In [110]: p1
Out[110]:
array(['odd', 'even', 'odd', 'odd',
'odd', 'even', 'odd', 'even', 'odd',
'even', 'odd', 'odd', 'odd', 'even',
'odd', 'even', 'odd', 'even',
'odd', 'odd'], dtype='<U4')
In [111]: cm1 =
confusion_matrix(targettest, p1)
In [112]: cm1
Out[112]:
array([[ 7, 3],
[
0, 10]], dtype=int64)
In [113]: kappa(cm1)
Out[113]: (0.7, 0.85, 17, 0.15, 3)
In [114]: model1 =
DecisionTreeClassifier(random_state=220223523)
In [115]: model1.fit(nontargettrain,
targettrain) # train the model
Out[115]:
DecisionTreeClassifier(random_state=220223523)
In [116]: p1 =
model1.predict(nontargettest)
In [117]: p1
Out[117]:
array(['odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even', 'odd',
'even', 'odd', 'even', 'odd', 'even',
'odd', 'even', 'odd', 'even',
'odd', 'even'], dtype='<U4')
In [118]: cm1 =
confusion_matrix(targettest, p1)
In [119]: cm1
Out[119]:
array([[10, 0],
[
0, 10]], dtype=int64)
In [120]: kappa(cm1)
Out[120]: (1.0, 1.0, 20, 0.0, 0)
In [121]: model1.tree_
Out[121]: <sklearn.tree._tree.Tree at
0x26071c46d40>
In [122]: model1.tree_.node_count #
count of tree nodes
Out[122]: 15