Text Classification

Basic Steps:

  1. Data Cleaning

  2. Import Necessary Libraries

  3. Read the dataset

  4. Visualize the dataset

  5. Find the trend and relation in between features

  6. Convert text into numerical features

  7. Display sparse matrix

  8. Split dataset into training and testing set

  9. Perform learning operation - fit

  10. Predict accuracy - score

  11. Validate result - Confusion Matrix, Classification Report

  12. Repeat the process - Till the desirable validation result

In [70]:
from IPython.core.display import Image 
Image(filename='C://Users//datta//Pictures//program_flow.jpg')
Out[70]:
In [11]:
# Import All required packages
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt4agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from matplotlib.axes import Subplot
%matplotlib inline
import pandas as pd
import numpy as np
from numpy.random import randn
from scipy import stats 
import requests
import seaborn as sns
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
import warnings

warnings.filterwarnings("ignore")
In [71]:
from ipywidgets import interactive
from IPython.display import Audio, display
import ipywidgets as widgets
from IPython.display import display, clear_output, Javascript
from traitlets import Unicode

# nbconvert related imports
from nbconvert import get_export_names, export_by_name
from nbconvert.writers import FilesWriter
from nbformat import read, NO_CONVERT
from nbconvert.utils.exceptions import ConversionException
In [72]:
notebook_name = widgets.Text()
In [73]:
js = """IPython.notebook.kernel.widget_manager.get_model('%s').then(function(model) {
      model.set('value', IPython.notebook.notebook_name);
    model.save();
});
""" % notebook_name.model_id
display(Javascript(data=js))
In [74]:
filename = notebook_name.value
filename
Out[74]:
'Text Classification_MNB_smaller_version.ipynb'
In [75]:
exporter_names = widgets.Dropdown(options=get_export_names(), value='html')
export_button = widgets.Button(description="Export")
download_link = widgets.HTML(visible=False)
In [17]:
taws_df = pd.read_excel('just_3.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
In [18]:
#smaller version
taws_df.count()
Out[18]:
CAUSE_LEVEL_1                30248
Description                  30248
Resolution                   23133
Corrective_Action_lessons    27446
General_Catigory             30248
LOCATION                     30248
Merge                        30248
dtype: int64
In [19]:
transformer = TfidfTransformer(smooth_idf=False)
transformer 
Out[19]:
TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)
In [20]:
corpus = taws_df['Merge']
In [21]:
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus).toarray()
print (X.shape)
(30248, 30382)
In [22]:
X
Out[22]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [23]:
vectorizer.get_feature_names()
Out[23]:
['00',
 '000',
 '0000',
 '0000h',
 '0000hrs',
 '0005',
 '0006',
 '0008',
 '000lb',
 '001',
 '0010',
 '00100lt',
 '0015',
 '0015lt',
 '002',
 '0020',
 '0025',
 '00250lt',
 '0029',
 '003',
 '0030',
 '0035',
 '004',
 '0042lt',
 '0045',
 '0045hrs',
 '005',
 '005s',
 '006',
 '007',
 '009',
 '009591959',
 '00am',
 '00h18',
 '00lt',
 '00m',
 '01',
 '010',
 '0100',
 '0100hrs',
 '0100lt',
 '0105',
 '0105h',
 '011',
 '0110hrs',
 '0115',
 '0118',
 '011vwd',
 '012',
 '0125lt',
 '0130',
 '0130hr',
 '013vwd',
 '014',
 '0145',
 '015',
 '0150',
 '016',
 '0169',
 '017',
 '019',
 '01and',
 '01chk',
 '01fig',
 '01no',
 '01st',
 '02',
 '0200',
 '0200hrs',
 '021',
 '0210',
 '0212',
 '0215lt',
 '022',
 '0220',
 '0221',
 '0224',
 '0228',
 '022vcp',
 '0230',
 '0230hrs',
 '0234',
 '0237',
 '023as',
 '024',
 '0240',
 '0242',
 '0248',
 '025',
 '0250',
 '0255',
 '027â',
 '02chk',
 '02fig',
 '02frm',
 '03',
 '030',
 '0300',
 '0300lt',
 '0308',
 '031',
 '0310',
 '0313',
 '0315lt',
 '0318lt',
 '032',
 '0327hrs',
 '033',
 '0330',
 '0330h',
 '0335',
 '0338lt',
 '034',
 '0340h',
 '0345h',
 '037',
 '037225',
 '039',
 '03a',
 '03lt',
 '03th',
 '03â',
 '04',
 '0400',
 '0400am',
 '0400h',
 '0400hrs',
 '0400lt',
 '0405',
 '0406',
 '0406lt',
 '040deg',
 '041',
 '0410h',
 '0410lt',
 '0412h',
 '0415',
 '0420hrs',
 '0420lt',
 '0425lt',
 '0429',
 '043',
 '0430',
 '0430hrs',
 '0435',
 '0440',
 '0445lt',
 '0446',
 '044e57',
 '045',
 '0450',
 '0459',
 '0460',
 '048',
 '0483',
 '049',
 '04th',
 '05',
 '0500',
 '0500lt',
 '0509',
 '0510',
 '0512',
 '0515lt',
 '0518',
 '0519',
 '052',
 '0520lt',
 '0530',
 '0535',
 '054',
 '0545',
 '055',
 '0550',
 '055092557',
 '057',
 '0579',
 '059',
 '05min',
 '05th',
 '06',
 '060',
 '0600',
 '0600hrs',
 '0601',
 '0605hrs',
 '0607',
 '060782',
 '060882',
 '061',
 '0610',
 '0615',
 '0618',
 '0625',
 '063',
 '0630',
 '0635',
 '064',
 '064306',
 '0647',
 '0648',
 '065',
 '0655',
 '068',
 '06h41',
 '06h50',
 '06â',
 '07',
 '0700',
 '0710',
 '0712',
 '0715',
 '0716',
 '072000',
 '072003',
 '0730',
 '073000',
 '0730hrs',
 '0732',
 '0733',
 '0734',
 '0735',
 '0740',
 '0745',
 '0745hrs',
 '079',
 '08',
 '0800',
 '0800h',
 '0800hrs',
 '0800lt',
 '080t',
 '0810',
 '081003',
 '0810lt',
 '0811',
 '0815',
 '0816',
 '0817',
 '082',
 '0824',
 '0830',
 '0830h',
 '0835',
 '0836',
 '0840',
 '0840h',
 '0840hrs',
 '0845',
 '085',
 '0850',
 '087',
 '088',
 '08h30',
 '08lt',
 '08nm',
 '08th',
 '09',
 '090',
 '0900',
 '0900h',
 '0900hrs',
 '0900l',
 '0900lt',
 '0903',
 '0905',
 '090deg',
 '0910',
 '091000',
 '091001',
 '091005',
 '0912',
 '0915',
 '0915hrs',
 '0918',
 '0924',
 '0925',
 '0929',
 '0930',
 '0930h',
 '0930lt',
 '0935',
 '0940',
 '0945',
 '0945hrs',
 '0948',
 '0949',
 '095',
 '0950',
 '0950lt',
 '0951',
 '0952',
 '0954',
 '0e',
 '0kg',
 '0m',
 '0mm',
 '0mt',
 '0mtrs',
 '0n',
 '0ne',
 '0nm',
 '0ppm',
 '0r',
 '0t',
 '0x',
 '10',
 '100',
 '1000',
 '10000',
 '1000h',
 '1000hrs',
 '1000lt',
 '1000rpm',
 '1002',
 '1003',
 '1005',
 '1005kw',
 '100982',
 '100bar',
 '100c',
 '100m',
 '100ppm',
 '101',
 '101082',
 '1013',
 '1015',
 '1015h',
 '1015hrs',
 '1016',
 '1016mb',
 '1018',
 '102',
 '1020',
 '1024',
 '1025',
 '103',
 '1030',
 '1030am',
 '1030h',
 '1030hrs',
 '1034',
 '1035',
 '1035hrs',
 '1036',
 '1045',
 '1045hrs',
 '104a',
 '105',
 '1050',
 '10500',
 '1053',
 '1055',
 '106',
 '1066',
 '107',
 '1078',
 '1089',
 '109',
 '10cm',
 '10deg',
 '10g',
 '10m',
 '10minutes',
 '10mm',
 '10mts',
 '10nm',
 '10ppm',
 '10t',
 '10utc',
 '10v',
 '10x8',
 '11',
 '110',
 '1100',
 '1100hrs',
 '1105h',
 '110m3',
 '110v',
 '110vac',
 '110volts',
 '111',
 '1115hrs',
 '1118lt',
 '1119',
 '112',
 '1120',
 '1121',
 '1127',
 '1127log',
 '113',
 '1130hrs',
 '1133',
 '1135',
 '1139',
 '114',
 '1140',
 '1144',
 '1147',
 '115',
 '1150',
 '1151',
 '1152',
 '115t',
 '116',
 '1162',
 '1182',
 '119',
 '11oo',
 '11pcs',
 '12',
 '120',
 '1200',
 '1200h',
 '1200hrs',
 '1200mmwg',
 '1205',
 '1206',
 '1207',
 '120ltrs',
 '120of',
 '120v',
 '120vac',
 '121',
 '1210',
 '1210lt',
 '1211',
 '1214',
 '1215',
 '1215hrs',
 '1225',
 '123',
 '1230',
 '1245',
 '1245hrs',
 '125',
 '1254',
 '1255',
 '127',
 '1273',
 '129',
 '12hrs',
 '12kg',
 '12lt',
 '12mins',
 '12mm',
 '12months',
 '12mt',
 '12mts',
 '12n24',
 '12nn',
 '12p',
 '12t',
 '12th',
 '12x4',
 '13',
 '130',
 '1300',
 '13000rpm',
 '1300hrs',
 '1300lt',
 '1305',
 '130c',
 '131',
 '1312hrs',
 '1313',
 '1314',
 '1315',
 '1315hrs',
 '1320',
 '1320h',
 '1325',
 '1325hrs',
 '133',
 '1330',
 '1330hrs',
 '1330lt',
 '1336hrs',
 '1340',
 '1344',
 '1345',
 '1345h',
 '1345hrs',
 '1345lt',
 '1346',
 '1348',
 '1349',
 '135',
 '1350',
 '1353',
 '1354',
 '1354hrs',
 '1355',
 '1357',
 '135l',
 '136',
 '13600rpm',
 '1366',
 '137',
 '1379',
 '138',
 '1383',
 '1385',
 '1386',
 '1389',
 '139',
 '1398',
 '13cm',
 '13th',
 '14',
 '140',
 '1400',
 '14001',
 '1400hrs',
 '1406',
 '140bar',
 '140c',
 '140t',
 '141',
 '1410h',
 '1418',
 '142',
 '1420',
 '1420h',
 '1422hrs',
 '1426hrs',
 '1428',
 '143',
 '1430',
 '1430l',
 '1430lt',
 '1435',
 '1435h',
 '1435hrs',
 '1442',
 '1442hrs',
 '1444',
 '1445',
 '1445lt',
 '1447hrs',
 '1448',
 '1448hrs',
 '145',
 '1450',
 '1455lt',
 '1459',
 '146',
 '147',
 '148',
 '14m',
 '14mmx8mm',
 '14th',
 '15',
 '150',
 '1500',
 '1500bar',
 '1500hrs',
 '1500lt',
 '150bars',
 '150cm',
 '150ltr',
 '150m',
 '150mm',
 '150v',
 '151',
 '1510h',
 '1510lt',
 '1515',
 '151g',
 '1524',
 '1525lt',
 '152g139',
 '1530',
 '1530h',
 '1530hrs',
 '1532',
 '1533',
 '1536',
 '1540',
 '1545',
 '1545lt',
 '154m',
 '155',
 '1550',
 '1552',
 '1553hrs',
 '15cm',
 '15hrs',
 '15kg',
 '15lt',
 '15m',
 '15nm',
 '15ppm',
 '15sec',
 '15th',
 '15yrs',
 '16',
 '160',
 '1600',
 '16000',
 '1600h',
 '1605',
 '1607',
 '1610',
 '1620',
 '1625',
 '1630',
 '1630h',
 '1630hrs',
 '1630lt',
 '164',
 '1640',
 '1642',
 '1645lt',
 '1650',
 '1653',
 '1655',
 '165â',
 '166',
 '167',
 '168',
 '169',
 '16a',
 '16kpa',
 '16m',
 '16mm',
 '16th',
 '16âº',
 '17',
 '170',
 '1700',
 '1700h',
 '1700hrs',
 '1706',
 '1706lt',
 '1710',
 '1712',
 '1715',
 '173',
 '1730',
 '1730lt',
 '1736',
 '173cm',
 '174',
 '175',
 '1755',
 '1757',
 '175â',
 '176',
 '177',
 '1773',
 '17a',
 '17oohrs',
 '17sb',
 '17t',
 '17th',
 '18',
 '180',
 '1800',
 '1800hrs',
 '1800lt',
 '1805',
 '1810',
 '1812',
 '1815',
 '1815hrs',
 '1820',
 '1825',
 '1826',
 '183',
 '1830',
 '1830hrs',
 '1833',
 '1836hrs',
 '1838hrs',
 '1840',
 '1842',
 '1845',
 '1848',
 '185',
 '1850',
 '1850hrs',
 '1875',
 '188',
 '18h42',
 '18m',
 '18th',
 '19',
 '190',
 '1900',
 '1900hrs',
 '190111',
 '190112',
 '1905',
 '1909',
 '1910h',
 '1915',
 '192',
 '1920h',
 '1922',
 '1923',
 '1928',
 '1930',
 '1930hrs',
 '1932',
 '1942',
 '1945',
 '1950',
 '1950h',
 '1952',
 '1974',
 '199',
 '1993',
 '19th',
 '1a',
 '1ae',
 '1assit',
 '1c',
 '1cable',
 '1ch',
 '1cooling',
 '1cub',
 '1day',
 '1db',
 '1dbwbt',
 '1ft',
 '1gen',
 '1h',
 '1hr',
 '1kg',
 '1ltrs',
 '1m',
 '1m3',
 '1meter',
 '1metre',
 '1mm',
 '1month',
 '1mtr',
 '1nm',
 '1no',
 '1o',
 '1oe',
 '1on',
 '1p',
 '1pc',
 '1s',
 '1spring',
 '1st',
 '1t',
 '1ton',
 '1was',
 '1x',
 '1x2',
 '1â',
 '1â½',
 '20',
 '200',
 '2000',
 '2000h',
 '2000hrs',
 '2000lt',
 '2000ppm',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '200l',
 '200litre',
 '200ltrs',
 '200m',
 '200mm',
 '200mtrs',
 '200ppm',
 '201',
 '2010',
 '2010hrs',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015hrs',
 '2015lt',
 '2019',
 '201a',
 '202',
 '2020',
 '203',
 '2030',
 '2030hrs',
 '2040',
 '2042',
 '205',
 '2055',
 '2059',
 '209',
 '20cm',
 '20deg',
 '20degs',
 '20ft',
 '20kg',
 '20knot',
 '20kv',
 '20l',
 '20litres',
 '20lt',
 '20ltr',
 '20ltrs',
 '20m',
 '20man',
 '20min',
 '20mins',
 '20mm',
 '20mtrs',
 '20nm',
 '20seconds',
 '20th',
 '20tonnes',
 '20â',
 '20âº',
 '21',
 '210',
 '2100',
 '2100hrs',
 '2100i',
 '2103',
 '2107',
 '210deg',
 '2115',
 '2115lt',
 '2118',
 '212',
 '2120',
 '2122',
 '2124',
 '213',
 '2130',
 '2135h',
 '2138',
 '2139',
 '214',
 '2140lt',
 '2142',
 '2145hrs',
 '2148',
 '2154',
 '2159',
 '215deg',
 '216',
 '218',
 '21st',
 '21x29x50',
 '21x29x50cm',
 '22',
 '220',
 '2200',
 '22000',
 '2200hrs',
 '2200hts',
 '2200lt',
 '2201',
 '220v',
 '220vac',
 '220volts',
 '2210',
 '2210lt',
 '2211',
 '2215',
 '2220hrs',
 '223',
 '2230',
 '2230lt',
 '224â',
 '2251',
 '2255',
 '2257',
 '226',
 '228',
 '229f',
 '229â',
 '22aug',
 '22nd',
 '23',
 '230',
 '2300',
 '2300hrs',
 '2300lt',
 '2305',
 '2308490',
 '230deg',
 '230f',
 '230v',
 '232',
 '2320',
 '2329',
 '2330',
 '2330lt',
 '2335',
 '2335hrs',
 '233b',
 '234',
 '235',
 '2350',
 '23529kw',
 '2354',
 '2371',
 '23a',
 '23b',
 '23d',
 '23e',
 '23i',
 '23lt',
 '23m',
 '23mpa',
 '23rd',
 '24',
 '240',
 '2400',
 '2416',
 '241f',
 '2421',
 '245',
 '2450',
 '2470',
 '248',
 '24hrs',
 '24lt',
 '24th',
 '24v',
 '24volt',
 '24volts',
 '25',
 '250',
 '2500',
 '250mt',
 '250v',
 '254v',
 '255â',
 '257',
 '258',
 '25cub',
 '25dec',
 '25hp',
 '25kg',
 '25knots',
 '25kts',
 '25ltr',
 '25m',
 '25metres',
 '25nm',
 '25rpm',
 '25t',
 '25th',
 '26',
 '261',
 '261â',
 '267',
 '268',
 '26th',
 '26â',
 '27',
 '270',
 '2700',
 '270â',
 '2735',
 '275',
 '2782',
 '2790',
 '27f',
 '27g',
 '27m',
 '27rpm',
 '27th',
 '28',
 '280',
 '2800',
 '280nm',
 '281',
 '2810',
 '2845',
 '285',
 '288',
 '28th',
 '29',
 '290',
 '292',
 '29aug09',
 '29cfr',
 '29mar',
 '29th',
 '2a',
 '2ae',
 '2aux',
 '2cables',
 '2ct',
 '2de',
 '2deck',
 '2e',
 ...]
In [24]:
causes = taws_df["CAUSE_LEVEL_1"].unique()
cause_dict = {value:index for index, value in enumerate(causes)}
y = taws_df["CAUSE_LEVEL_1"].map(cause_dict)
cause_dict
Out[24]:
{'     nb      vcnb  ': 5,
 'ENVIRONMENT / WEATHER': 1,
 'EQUIPMENT': 2,
 'ORGANIZATION': 4,
 'PEOPLE': 0,
 'UNSPECIFIED': 3}
In [25]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.2)
In [26]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[26]:
0.78347107438016528
In [27]:
clf_dt = tree.DecisionTreeClassifier(max_depth=10)
clf_dt.fit (X_train, y_train)
clf_dt.score (X_test, y_test)
Out[27]:
0.77669421487603307
In [28]:
y_test
Out[28]:
16543    0
20613    2
18600    2
6794     0
11416    0
14463    0
2905     0
13024    0
3078     0
8951     0
8552     0
6317     0
13168    0
6894     0
13856    0
8812     0
9250     0
3932     2
16353    0
20314    0
11105    0
6859     2
24919    2
2611     2
10683    0
12022    2
20952    2
17558    2
11991    0
26854    2
        ..
27557    1
25124    0
10219    0
13860    0
19152    4
8399     0
4758     0
27307    0
28275    0
28813    0
9055     0
4299     0
13396    0
5292     1
17231    0
22747    0
7715     0
9271     0
20913    0
26602    2
29489    0
12000    0
11975    0
16706    2
6351     2
14968    0
7554     0
5515     4
3848     0
9198     0
Name: CAUSE_LEVEL_1, Length: 6050, dtype: int64
In [29]:
y_pred = clf.predict(X_test)
In [35]:
shuffle_validator = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2)
def test_classifier(clf):
    """Cross-validate *clf* on the module-level X/y and print mean accuracy.

    Uses the module-level ``shuffle_validator`` (20 shuffled 80/20 splits)
    as the CV strategy.  Prints "Accuracy: <mean> (+/- <std>)"; returns None.
    """
    # Use cross_val_score from sklearn.model_selection (already imported at
    # the top of this notebook) instead of the deprecated
    # sklearn.cross_validation module, which was removed in sklearn 0.20.
    scores = cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))
In [36]:
cm = confusion_matrix(y_test, y_pred)
cm
Out[36]:
array([[3985,    5,  552,    0,   98],
       [  77,    9,   81,    0,    5],
       [ 310,    1,  677,    0,   25],
       [   6,    1,    8,    1,    1],
       [  85,    1,   54,    0,   68]])
In [39]:
plt.matshow(cm)
plt.title('Confusion matrix')
c = plt.summer()
plt.colorbar(c)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [40]:
target_names = ['PEOPLE', 'ENVIRONMENT', 'EQUIPMENT', 'ORGANISATION', 'UNSPECIFIED']
In [43]:
print(classification_report(y_test, y_pred, target_names=target_names))
              precision    recall  f1-score   support

      PEOPLE       0.89      0.86      0.88      4640
 ENVIRONMENT       0.53      0.05      0.10       172
   EQUIPMENT       0.49      0.67      0.57      1013
ORGANISATION       1.00      0.06      0.11        17
 UNSPECIFIED       0.35      0.33      0.34       208

 avg / total       0.80      0.78      0.78      6050

In [44]:
test_classifier(clf)
Accuracy: 0.7717 (+/- 0.00)
In [45]:
sample_dataframe = taws_df.sample(n=200)
In [46]:
pwas_df = pd.read_excel('200ex.xlsx', 'Sheet1', index_col=None)
In [47]:
pwas_df.head()
Out[47]:
Description
0 J/Eng was checking the chemical racks, where h...
1 One stay wire connected to Christmas Tree loos...
2 I found out that Pilot combination ladder (P) ...
3 AT (Time). One crew member kept food in microw...
4 During loading operations in (Location) by shi...
In [48]:
n = 0
In [49]:
for n in range(200):
    pm = pwas_df.ix[n]
    vect_pm = vectorizer.transform(pm).toarray()
    m = clf.predict(vect_pm)
    print(m)  
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[2]
[0]
[0]
[2]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[4]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[2]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[2]
[4]
[0]
[0]
[2]
[2]
[2]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
In [50]:
cons_df = pd.read_excel('machine_vs_human.xlsx', 'Sheet1', index_col=None)
In [51]:
m_test = cons_df['humnan']
In [52]:
m_pred = cons_df['machine']
In [53]:
cm2 = confusion_matrix(m_test, m_pred)
In [54]:
cm2
Out[54]:
array([[152,   0,  10,   0,   0],
       [  4,   3,   0,   0,   0],
       [  4,   0,  18,   0,   0],
       [  0,   0,   1,   0,   0],
       [  4,   0,   2,   0,   2]])
In [55]:
plt.matshow(cm2)
plt.title('Confusion matrix')
c = plt.summer()
plt.colorbar(c)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [56]:
print(classification_report(m_test, m_pred, target_names=target_names))
              precision    recall  f1-score   support

      PEOPLE       0.93      0.94      0.93       162
 ENVIRONMENT       1.00      0.43      0.60         7
   EQUIPMENT       0.58      0.82      0.68        22
ORGANISATION       0.00      0.00      0.00         1
 UNSPECIFIED       1.00      0.25      0.40         8

 avg / total       0.89      0.88      0.87       200

This is a very useful tool for sharing the notebook and converting it into different file formats. source

In [76]:
file_writer = FilesWriter()

def export(name, nb):
    """Convert notebook *nb* to the format chosen in ``exporter_names``.

    Parameters
    ----------
    name : str
        Notebook filename; the extension is stripped to form the output
        base name.
    nb : NotebookNode
        Parsed notebook content to convert.

    Side effects: writes the converted file via the module-level
    ``file_writer`` and updates the ``download_link`` widget with either a
    download link or an error message.
    """
    # Get a unique key for the notebook and set it in the resources object.
    notebook_name = name[:name.rfind('.')]
    resources = {}
    resources['unique_key'] = notebook_name
    resources['output_files_dir'] = '%s_files' % notebook_name
    # NOTE(review): ``resources`` is built but never passed to
    # export_by_name below, so unique_key/output_files_dir are ignored —
    # confirm whether it should be forwarded as resources=resources.

    # Try to export
    try:
        output, resources = export_by_name(exporter_names.value, nb)
    except ConversionException:
        download_link.value = "<br>Could not export notebook!"
    else:
        write_results = file_writer.write(output, resources, notebook_name=notebook_name)

        # Bug fix: the str.format placeholders were missing, so the anchor
        # showed literal text instead of the written filename.
        download_link.value = (
            "<br>Results: <a href='files/{filename}'><i>\"{filename}\"</i></a>"
            .format(filename=write_results)
        )
        download_link.visible = True
        
def handle_export(widget):
    """Export-button callback: load the current notebook file and export it.

    Reads the notebook named by the module-level ``filename`` and hands the
    parsed content to ``export``.  The *widget* argument (the clicked
    button) is unused.
    """
    with open(filename, 'r') as nb_file:
        notebook = read(nb_file, NO_CONVERT)
        export(filename, notebook)
        
export_button.on_click(handle_export)
In [77]:
display(exporter_names, export_button, download_link)
In [ ]:
 
Skype Backpack