Feature extraction, selection and predictive modeling with Scikit

Feature extraction, selection and predictive modeling with Scikit

Table of content

Feature extraction

Feature selection

Predictive modeling

Empirical evaluation

Experimental settings

Results

Codes

Python code that is used to perform the experiments are listed in the following code block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
feature selection with scikit learn
copyright @ Hongyu Su (hongyu.su@me.com)
"""

from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
import itertools


# collect data for training and test
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
remove = ('headers','footers','quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42, remove=remove)
y_train,y_test = data_train.target, data_test.target


vectorizationModels = (
    (HashingVectorizer(stop_words='english', non_negative=True,n_features=2**16),'Hashing'),
    (TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'),'Tfidf'))

rfesvc = SVC(kernel="linear", C=1)
featureSelectionModels = (
    (SelectKBest(chi2, k=5),'Univariance'),
    (LinearSVC(C=0.1, penalty="l1", dual=False),'L1'),
    (ExtraTreesClassifier(),'Tree'),
    (RFE(estimator=rfesvc, n_features_to_select=10000, step=1),'RFE'))

classificationModels = (
    (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
    (Perceptron(n_iter=50), "Perceptron"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(n_estimators=100), "Random forest"),
    (LinearSVC(loss='squared_hinge', penalty='l1', dual=False, tol=1e-3),"SVC-L1"),
    (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3),"SVC-L2"),
    (SGDClassifier(alpha=.0001, n_iter=50,penalty='l1'),'SGD-L1'),
    (SGDClassifier(alpha=.0001, n_iter=50,penalty='l2'),'SGD-L2'),
    (SGDClassifier(alpha=.0001, n_iter=50,penalty='elasticnet'),'SGD-ElasticNet'),
    (NearestCentroid(),'Nearest neighbor'),
    (MultinomialNB(alpha=.01),'NB1'),
    (BernoulliNB(alpha=.01),'NB2'))

def benchmark(classifier):
    classifier.fit(X_train,y_train)
    pred = classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)    
    return score

results = []
for (vectorizer,vectorizerName),(selector,selectorName),(classifier,classifierName) in list(itertools.product(vectorizationModels,featureSelectionModels,classificationModels)):
    # vectorize input
    if vectorizerName=='Hashing':
        X_train = vectorizer.transform(data_train.data)
    else:
        X_train = vectorizer.fit_transform(data_train.data)
    X_test = vectorizer.transform(data_test.data)
    print(X_train.shape)
    # feature selection
    X_train = selector.fit_transform(X_train,y_train)
    X_test = selector.transform(X_test)
    # classification model
    results.append((vectorizerName,selectorName,classifierName,benchmark(classifier)))


for x in results:
    print(x)
Hongyu Su 12 October 2015