from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
# Demo: turn per-file word frequencies into a document-term matrix.
# Each "file" is just a tuple of tokens here.
File_1 = ('aaa', 'xyz', 'cccc', 'dddd', 'aaa')
File_2 = ('abc', 'aaa')
v = DictVectorizer()
# Discover the corpus and vectorize file word frequencies in a single pass:
# Counter(f) maps each token to its count, and DictVectorizer learns the
# vocabulary (one column per distinct token) while building the matrix.
X = v.fit_transform(Counter(f) for f in (File_1, File_2))
# Or, if you have a pre-defined corpus and/or would like to restrict the words
# you consider in your matrix, you can do
# Corpus = ('aaa', 'bbb', 'cccc', 'dddd', 'xyz')
# v.fit([OrderedDict.fromkeys(Corpus, 1)])
# X = v.transform(Counter(f) for f in (File_1, File_2))
# X is a scipy sparse matrix; call toarray() for a dense numpy.ndarray
# representation (preferred over the .A shorthand, which is deprecated for
# SciPy's newer sparse-array types).
print(X)
print(X.toarray())
# Expected output (column order follows the sorted vocabulary
# 'aaa', 'abc', 'cccc', 'dddd', 'xyz'):
#   <2x5 sparse matrix of type '<class 'numpy.float64'>'
#   array([[ 2., 0., 1., 1., 1.],
#          [ 1., 1., 0., 0., 0.]])