Back

Explore Courses Blog Tutorials Interview Questions
0 votes
1 view
in Machine Learning by (19k points)

I am using a bag of words to classify text. It's working well but I am wondering how to add a feature which is not a word.

Here is my sample code.

import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier

# Training corpus: the first four texts are about New York, the last four
# (noticeably longer) texts are about London.
X_train = np.array(["new york is a hell of a town",

                    "new york was originally dutch",

                    "new york is also called the big apple",

                    "nyc is nice",

                    "the capital of great britain is london. london is a huge metropolis which has a great many number of people living in it. london is also a very old town with a rich and vibrant cultural history.",

                    "london is in the uk. they speak english there. london is a sprawling big city where it's super easy to get lost and i've got lost many times.",

                    "london is in england, which is a part of great britain. some cool things to check out in london are the museum and buckingham palace.",

                    "london is in great britain. it rains a lot in britain and london's fogs are a constant theme in books based in london, such as sherlock holmes. the weather is really bad there.",])

# One single-element label list per training text, aligned by position:
# [0] for the New York texts, [1] for the London texts.
y_train = [[0],[0],[0],[0],[1],[1],[1],[1]]

# Texts to classify once the model has been trained.
X_test = np.array(["it's a nice day in nyc",

                   'i loved the time i spent in london, the weather was great, though there was a nip in the air and i had to wear a jacket.'

                   ])

# Display names looked up by integer label when printing the predictions
# (index 0 -> 'Class 1', index 1 -> 'Class 2').
target_names = ['Class 1', 'Class 2']

# Three-stage text classifier: token counts -> TF-IDF weighting ->
# one-vs-rest linear SVM.
# NOTE(review): per the scikit-learn docs, integer min_df/max_df are
# document-count thresholds, so max_df=2 presumably drops every term that
# occurs in more than two training texts -- confirm this is intended.
classifier = Pipeline([

    ('vectorizer', CountVectorizer(min_df=1,max_df=2)),

    ('tfidf', TfidfTransformer()),

    ('clf', OneVsRestClassifier(LinearSVC()))])

# Train all pipeline stages on the labelled texts.
classifier.fit(X_train, y_train)

# Run the test texts through the same stages and collect their predicted labels.
predicted = classifier.predict(X_test)

# Print each test text next to the display name(s) of its predicted label(s).
for item, labels in zip(X_test, predicted):
    # np.atleast_1d lets the same loop handle a prediction row that is a
    # single scalar label as well as one that is a sequence of labels.
    names = ', '.join(target_names[x] for x in np.atleast_1d(labels))
    # print() with a single parenthesised argument behaves the same on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print('%s => %s' % (item, names))

It is clear that the texts about London tend to be much longer than the texts about New York. How would I add the length of the text as a feature? Do I have to use another way of classification and then combine the two predictions? Is there any way of doing it along with the bag of words? Some sample code would be great -- I'm very new to machine learning and scikit-learn.

1 Answer

0 votes
by (33.1k points)

Below is a solution that combines a FunctionTransformer, a Pipeline and a FeatureUnion.

import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import FunctionTransformer

# Training corpus: the first four texts are about New York, the last four
# (noticeably longer) texts are about London.
X_train = np.array(["new york is a hell of a town",

                    "new york was originally dutch",

                    "new york is also called the big apple",

                    "nyc is nice",

                    "the capital of great britain is london. london is a huge metropolis which has a great many number of people living in it. london is also a very old town with a rich and vibrant cultural history.",

                    "london is in the uk. they speak english there. london is a sprawling big city where it's super easy to get lost and i've got lost many times.",

                    "london is in england, which is a part of great britain. some cool things to check out in london are the museum and buckingham palace.",

                    "london is in great britain. it rains a lot in britain and london's fogs are a constant theme in books based in london, such as sherlock holmes. the weather is really bad there.",])

# Labels as a column array with one row per training text, aligned by
# position: 0 for the New York texts, 1 for the London texts.
y_train = np.array([[0],[0],[0],[0],[1],[1],[1],[1]])

# Texts to classify once the model has been trained.
X_test = np.array(["it's a nice day in nyc",

                   'i loved the time i spent in london, the weather was great, though there was a nip in the air and i had to wear a jacket.'

                   ])

# Display names for the two classes; not referenced by the rest of this snippet.
target_names = ['Class 1', 'Class 2']

def get_text_length(x):
    """Return the length of every item in *x* as a single-column array.

    Each item is measured with ``len``; the result has one row per item
    and exactly one column.
    """
    lengths = np.array(list(map(len, x)))
    return lengths.reshape(-1, 1)

# Classifier whose input features come from two branches combined by
# FeatureUnion: the TF-IDF bag of words plus the length of each text.
classifier = Pipeline([
    ('features', FeatureUnion([
        # Branch 1: token counts re-weighted by TF-IDF.
        ('text', Pipeline([
            ('vectorizer', CountVectorizer(min_df=1, max_df=2)),
            ('tfidf', TfidfTransformer()),
        ])),
        # Branch 2: one extra column holding len() of each text, computed
        # by get_text_length.
        # NOTE(review): validate=False presumably keeps FunctionTransformer
        # from input-checking the raw strings -- confirm against the
        # installed scikit-learn version.
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ])),
    ])),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# Train both feature branches and the SVM on the labelled texts.
classifier.fit(X_train, y_train)

# Predict labels for the test texts using the combined features.
predicted = classifier.predict(X_test)

# A bare `predicted` expression is only echoed in an interactive session;
# print it so that running the snippet as a script also shows the result.
print(predicted)

This will add the length of the text to the features used by the classifier.

Hope this answer helps you!

Browse Categories

...