def loadData(f):
    """Load a space-delimited numeric text file into a NumPy array.

    Args:
        f: Path of the file to read.

    Returns:
        The array produced by ``np.genfromtxt`` with ``delimiter=' '``.
    """
    # The context manager closes the handle as soon as parsing finishes,
    # instead of leaving that to garbage collection.
    with open(f, 'r') as handle:
        return np.genfromtxt(handle, delimiter=' ')

print "loading data.."

traindata = list(np.array(p.read_table('train.tsv'))[:,2])#Reading WebsiteText column for TF-IDF.

testdata = list(np.array(p.read_table('test.tsv'))[:,2])

y = np.array(p.read_table('train.tsv'))[:,-1] #reading label

# Word-level TF-IDF over unigrams and bigrams. The on/off options are
# passed as real booleans rather than the integer 1.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# L2-regularised logistic regression solved in its dual formulation.
rd = lm.LogisticRegression(
    penalty='l2',
    dual=True,
    tol=0.0001,
    C=1,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
    random_state=None,
)

X_all = traindata + testdata

lentrain = len(traindata)

print "fitting pipeline"

tfv.fit(X_all)

print "transforming data"

X_all = tfv.transform(X_all)

X = X_all[:lentrain]

X_test = X_all[lentrain:]

print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(rd, X, y, cv=20, scoring='roc_auc'))

# Add Two Integer Columns
#
# Widen the TF-IDF matrices with two numeric columns taken from the TSVs.
# NOTE(review): columns 3 and 4 are assumed to be AlexaRank and
# GooglePageRank -- confirm against the TSV header and adjust the slice.
from scipy import sparse

# [:, 3:5] keeps every row and selects two columns. astype(float) turns
# the cells into numbers and raises if one is non-numeric (for example a
# '?' placeholder); clean or impute such cells before this step.
AlexaAndGoogleTrainData = np.array(p.read_table('train.tsv'))[:, 3:5].astype(float)

AlexaAndGoogleTestData = np.array(p.read_table('test.tsv'))[:, 3:5].astype(float)

# Training rows first, then test rows: the same order used to build X_all,
# so the lentrain split lines up with X and X_test.
AllAlexaAndGoogleInfo = np.vstack((AlexaAndGoogleTrainData, AlexaAndGoogleTestData))

# Add two columns to X and to X_test so both keep the same feature count.
# Both are scipy sparse matrices, so they are stacked with sparse.hstack;
# np.append does not work on sparse input.
X = sparse.hstack((X, AllAlexaAndGoogleInfo[:lentrain]), format='csr')
X_test = sparse.hstack((X_test, AllAlexaAndGoogleInfo[lentrain:]), format='csr')

print "training on full data"

rd.fit(X,y)

pred = rd.predict_proba(X_test)[:,1]

testfile = p.read_csv('test.tsv', sep="\t", na_values=['?'], index_col=1)

pred_df = p.DataFrame(pred, index=testfile.index, columns=['label'])

pred_df.to_csv('benchmark.csv')

print "submission file created.."`