# Get the latest version of scikit-learn (library for machine learning)
! pip install --upgrade scikit-learn

Requirement already satisfied: scikit-learn in c:\users\colto\anaconda3\lib\site-packages (1.3.2)
Requirement already satisfied: scipy>=1.5.0 in c:\users\colto\anaconda3\lib\site-packages (from scikit-learn) (1.7.3)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\colto\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: numpy<2.0,>=1.17.3 in c:\users\colto\anaconda3\lib\site-packages (from scikit-learn) (1.21.5)
Requirement already satisfied: joblib>=1.1.1 in c:\users\colto\anaconda3\lib\site-packages (from scikit-learn) (1.3.2)


# Check the scikit-learn version. It should be 1.3.2.
!pip show scikit-learn

Name: scikit-learn
Version: 1.3.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: c:\users\colto\anaconda3\lib\site-packages
Requires: threadpoolctl, scipy, numpy, joblib
Required-by: scikit-learn-intelex


import numpy as np # performing calculations on arrays
import pandas as pd # dealing with dataframe

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# For tokenizing text and build a machine learning model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


# Load the data
trump = pd.read_csv("Data/trump.csv")
trump


# Convert texts into a matrix of word counts
vectorizer = CountVectorizer(max_features = 500) # Select the top 500 words
sparse_matrix = vectorizer.fit_transform(trump['text']) # Fit and transform the original text into a matrix
termdoc = pd.DataFrame(sparse_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Convert the matrix to dataframe, using words as column names
termdoc


# We want to begin with predicting what did Trump use to tweet. Let's see what "statusSource" looks like
trump.groupby('statusSource')['id'].count()

statusSource
<a href="http://instagram.com" rel="nofollow">Instagram</a>                               1
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                      120
<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>         1
<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>    762
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>      628
Name: id, dtype: int64


# Converting labels to numbers
def sourcestring_to_integer(sourcestring):
    if 'android' in sourcestring:
        return 0
    elif 'iphone' in sourcestring:
        return 1
    else:
        return 2


trump['statusSource']= trump['statusSource'].map(sourcestring_to_integer) # map statusSource to 0, 1 or 2
trump = trump[(trump['statusSource']==0) | (trump['statusSource']==1)] # keep 0 and 1 only (Android or iPhone)
trump


# First we have to shuffle the rows, as we're going to use the first 1000 rows as training set, and that might not be a random
trump = trump.sample(frac=1, random_state = 10) # This means we are keeping all data but have changed the order
trump


# The list of keys (label to be predicted) for our prediction
source_keys = trump['statusSource']
source_keys

7       1
32      1
311     1
147     0
1458    0
       ..
1261    1
1465    1
573     0
1263    1
1403    0
Name: statusSource, Length: 1390, dtype: int64


# Select the word counts to match the index of the keys
termdoc = termdoc.loc[source_keys.index]
termdoc


# We have to scale the matrix to reduce the power of frequent words (so that all data points will fall into the range of 0-1)
# In this way, the model will work better 
scaled_matrix = StandardScaler().fit_transform(termdoc)
scaled_matrix

array([[-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905],
       [-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905],
       [-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905],
       ...,
       [-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905],
       [-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905],
       [-0.11454053, -0.09056352, -0.08512565, ..., -0.17997929,
        -0.09331813, -0.05534905]])


# Using the first 1000 rows as the training set (note that we've already shuffled the data)
exact_model = LogisticRegression(C = 10000000).fit(scaled_matrix[:1000], source_keys[:1000])


# Then we make predictions using the model we trained (on the same data)
predictions = exact_model.predict(scaled_matrix[:1000])


# Let's see if the predicted results match the true labels
for i in range(20):
    if i in source_keys: # This is necessary because some rows are removed if they don't contain "Android" or "iPhone"
        print(source_keys[i], predictions[i])

0 1
1 1
1 1
0 0
0 0
0 0
1 0
1 1
0 0
1 0
0 0
0 1
0 0
0 1
1 1
1 0
1 1
1 0
1 0


# Calculate the proportion of true predictions
sum(predictions == source_keys[:1000]) / len(predictions)

1.0


# Then we try to make predictions outside the sample we originally trained on
test_predictions = exact_model.predict(scaled_matrix[1000 : , ])
sum(test_predictions == source_keys[1000 : ]) / len(test_predictions)

0.7461538461538462


def colorize(integer):
    if integer == 1:
        return 'Blue'
    else:
        return 'Red'


# First we test the confidence of the model on the training data
probabilities_train_exact = [x[1] for x in exact_model.predict_proba(scaled_matrix[:1000, ])]
# Here, we calculate the probability for the model assigning the data to the second class, namely, "iPhone"
colors_train_exact = [colorize(x) for x in source_keys[: 1000]]

plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_train_exact))), probabilities_train_exact, c = colors_train_exact, alpha = 0.4)
plt.show()


# Then we test the confidence of the model on the testing data
probabilities_test_exact = [x[1] for x in exact_model.predict_proba(scaled_matrix[1000:, ])]
colors_test_exact = [colorize(x) for x in source_keys[1000:]]

plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_test_exact))), probabilities_test_exact, c = colors_test_exact, alpha = 0.4)
plt.show()


# Let's change C value to 0.001 so that blurriness (randomness) is introduced to the model
blurry_model = LogisticRegression(C = 0.001).fit(scaled_matrix[0 : 1000, ], source_keys[0: 1000])


# Make predictions within the sample we originally trained on
predictions = blurry_model.predict(scaled_matrix[0 : 1000, ])
sum(predictions == source_keys[0:1000]) / len(predictions)

0.857


# Then we try to make predictions outside the sample we originally trained on
test_predictions = blurry_model.predict(scaled_matrix[1000 : , ])
sum(test_predictions == source_keys[1000 : ]) / len(test_predictions)

0.8


# First we test the confidence of the model on the training data using the blurry model
probabilities_train_blurry = [x[1] for x in blurry_model.predict_proba(scaled_matrix[:1000, ])]
colors_train_blurry = [colorize(x) for x in source_keys[: 1000]]

plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_train_blurry))), probabilities_train_blurry, c = colors_train_blurry, alpha = 0.4)
plt.show()


# Then we test the confidence of the model on the testing data using the blurry model
probabilities_test_blurry = [x[1] for x in blurry_model.predict_proba(scaled_matrix[1000:, ])]
colors_test_blurry = [colorize(x) for x in source_keys[1000:]]

plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_test_blurry))), probabilities_test_blurry, c = colors_test_blurry, alpha = 0.4)
plt.show()


blurry_model1 = LogisticRegression(C = 0.00001).fit(scaled_matrix[0 : 1000, ], source_keys[0: 1000])
predictions1 = blurry_model1.predict(scaled_matrix[0 : 1000, ])
print(sum(predictions1 == source_keys[0:1000]) / len(predictions1))
test_predictions1 = blurry_model1.predict(scaled_matrix[1000 : , ])
print(sum(test_predictions1 == source_keys[1000 : ]) / len(test_predictions1))
probabilities_train_blurry1 = [x[1] for x in blurry_model1.predict_proba(scaled_matrix[:1000, ])]
colors_train_blurry1 = [colorize(x) for x in source_keys[: 1000]]
plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_train_blurry1))), probabilities_train_blurry1, c = colors_train_blurry1, alpha = 0.4)
plt.show()




blurry_model2 = LogisticRegression(C = 0.01).fit(scaled_matrix[0 : 1000, ], source_keys[0: 1000])
predictions2 = blurry_model2.predict(scaled_matrix[0 : 1000, ])
print(sum(predictions2 == source_keys[0:1000]) / len(predictions2))
test_predictions2 = blurry_model2.predict(scaled_matrix[1000 : , ])
print(sum(test_predictions2 == source_keys[1000 : ]) / len(test_predictions2))
probabilities_train_blurry2 = [x[1] for x in blurry_model2.predict_proba(scaled_matrix[:1000, ])]
colors_train_blurry2 = [colorize(x) for x in source_keys[: 1000]]
plt.figure(figsize = (8, 6))
plt.scatter(list(range(len(probabilities_train_blurry2))), probabilities_train_blurry2, c = colors_train_blurry2, alpha = 0.4)
plt.show()

0.565
0.5051282051282051

0.96
0.7974358974358975


trump = pd.read_csv("Data/trump.csv")
vectorizer2 = CountVectorizer(max_features = 5000) 
sparse_matrix2 = vectorizer2.fit_transform(trump['text']) 
termdoc2 = pd.DataFrame(sparse_matrix2.toarray(), columns=vectorizer2.get_feature_names_out())
termdoc2 = termdoc2.loc[source_keys.index]
scaled_matrix2 = StandardScaler().fit_transform(termdoc2)
blurry_model3 = LogisticRegression(C = 0.001).fit(scaled_matrix2[0 : 1000, ], source_keys[0: 1000])
predictions3 = blurry_model3.predict(scaled_matrix2[0 : 1000, ])
print(sum(predictions3 == source_keys[0:1000]) / len(predictions3))
test_predictions3 = blurry_model3.predict(scaled_matrix2[1000 : , ])
print(sum(test_predictions3 == source_keys[1000 : ]) / len(test_predictions3))

0.996
0.8333333333333334


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words_used = stopwords.words('english')

trump = pd.read_csv("Data/trump.csv")
vectorizer_stopword = CountVectorizer(max_features = 500, stop_words = stop_words_used) 
sparse_matrix_stopword = vectorizer_stopword.fit_transform(trump['text'])
termdoc_stopword = pd.DataFrame(sparse_matrix_stopword.toarray(), columns=vectorizer_stopword.get_feature_names_out())
termdoc_stopword = termdoc_stopword.loc[source_keys.index]
scaled_matrix_stopword = StandardScaler().fit_transform(termdoc_stopword)
blurry_model_stopword = LogisticRegression(C = 0.001).fit(scaled_matrix_stopword[0 : 1000, ], source_keys[0: 1000])
probabilities_train_blurry_stopword = [x[1] for x in blurry_model_stopword.predict_proba(scaled_matrix[:1000, ])]
predictions_stopword = blurry_model_stopword.predict(scaled_matrix_stopword[0 : 1000, ])
print(sum(predictions_stopword == source_keys[0:1000]) / len(predictions_stopword))
test_predictions_stopword = blurry_model_stopword.predict(scaled_matrix_stopword[1000 : , ])
print(sum(test_predictions_stopword == source_keys[1000 : ]) / len(test_predictions_stopword))

0.87
0.8051282051282052

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\colto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reload the original model. This is unnecessary if you always give the new variables new names
trump = pd.read_csv("Data/trump.csv")
vectorizer = CountVectorizer(max_features = 500)
sparse_matrix = vectorizer.fit_transform(trump['text'])
termdoc = pd.DataFrame(sparse_matrix.toarray(), columns=vectorizer.get_feature_names_out())
termdoc = termdoc.loc[source_keys.index]
scaled_matrix = StandardScaler().fit_transform(termdoc)
blurry_model = LogisticRegression(C = 0.001).fit(scaled_matrix[0 : 1000, ], source_keys[0: 1000])


coefs = blurry_model.coef_
coefs = zip(coefs[0], vectorizer.get_feature_names_out()) # load the words and their weights into dictionary
coefs = sorted(coefs) # sort to find the top words that lead to differentiation


# words that suggest posted via iPhone
coefs[0:10]

[(-0.10342277040591821, 'realdonaldtrump'),
 (-0.05844929699548615, 'and'),
 (-0.05605521539372897, 'the'),
 (-0.04975030980032629, 'is'),
 (-0.0462910712486807, 'trump'),
 (-0.04098790340428868, 'are'),
 (-0.03871925237252498, 'that'),
 (-0.03614937712062732, '00'),
 (-0.0350105049412957, 'megynkelly'),
 (-0.03463746227791514, 'than')]


# words that suggest posted via Android
coefs[-10:]

[(0.031520462726903314, 'trumppence16'),
 (0.031624234496627636, 'imwithyou'),
 (0.03243869069982664, 'amp'),
 (0.03763946984603057, 'americafirst'),
 (0.040955688456980235, 'join'),
 (0.043749569510282756, 'makeamericagreatagain'),
 (0.060279250989956214, 'thank'),
 (0.07920734938584943, 'trump2016'),
 (0.12618188881786746, 'co'),
 (0.12719236926526017, 'https')]


# Convert retweet number to two categories
def map_retweet(number):
    if number <= 5000: 
        return 0
    else:
        return 1
trump['retweetCount']= trump['retweetCount'].map(map_retweet)
trump = trump.sample(frac=1, random_state = 10) 
trump


# Regenerate the word matrix based on the new dataset
source_keys_retweet = trump['retweetCount']
vectorizer = CountVectorizer(max_features = 500)
sparse_matrix = vectorizer.fit_transform(trump['text'])
termdoc = pd.DataFrame(sparse_matrix.toarray(), columns=vectorizer.get_feature_names_out())
termdoc_retweet = termdoc.loc[source_keys_retweet.index]
scaled_matrix_retweet= StandardScaler().fit_transform(termdoc_retweet)
scaled_matrix_retweet

array([[-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253],
       [-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253],
       [-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253],
       ...,
       [-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253],
       [-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253],
       [-0.10976426, -0.08680428, -0.08560635, ..., -0.1762674 ,
        -0.09312598, -0.05306253]])


source_keys_retweet5 = trump['retweetCount']
vectorizer5 = CountVectorizer(max_features = 5000)
sparse_matrix5 = vectorizer5.fit_transform(trump['text'])
termdoc5 = pd.DataFrame(sparse_matrix5.toarray(), columns=vectorizer5.get_feature_names_out())
termdoc_retweet5 = termdoc5.loc[source_keys_retweet5.index]
scaled_matrix_retweet5= StandardScaler().fit_transform(termdoc_retweet5)

retweet_blurry_model = LogisticRegression(C = 0.001).fit(scaled_matrix_retweet5[0 : 1000, ], source_keys[0: 1000])
predictions_retweet = retweet_blurry_model.predict(scaled_matrix_retweet5[1000: , ])
print(sum(predictions_retweet == source_keys_retweet5[1000 : ]) / len(predictions_retweet))

0.458984375

	Unnamed: 0.1	Unnamed: 0	text	favorited	favoriteCount	replyToSN	created	truncated	replyToSID	id	replyToUID	statusSource	screenName	retweetCount	isRetweet	retweeted	longitude	latitude
0	0	1	My economic policy speech will be carried live...	False	9214	NaN	2016-08-08 15:20:44	False	NaN	762669882571980801	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	3107	False	False	NaN	NaN
1	1	2	Join me in Fayetteville, North Carolina tomorr...	False	6981	NaN	2016-08-08 13:28:20	False	NaN	762641595439190016	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	2390	False	False	NaN	NaN
2	2	3	#ICYMI: "Will Media Apologize to Trump?" https...	False	15724	NaN	2016-08-08 00:05:54	False	NaN	762439658911338496	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	6691	False	False	NaN	NaN
3	3	4	Michael Morell, the lightweight former Acting ...	False	19837	NaN	2016-08-07 23:09:08	False	NaN	762425371874557952	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	6402	False	False	NaN	NaN
4	4	5	The media is going crazy. They totally distort...	False	34051	NaN	2016-08-07 21:31:46	False	NaN	762400869858115588	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	11717	False	False	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1507	1507	1508	"@constant4change: Trump tops Dem candidates o...	False	2590	NaN	2015-12-20 08:21:23	False	NaN	678490367285678081	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	990	False	False	NaN	NaN
1508	1508	1509	"@autumnandews08: @realDonaldTrump @jonkarl Hi...	False	3550	NaN	2015-12-20 05:25:13	False	NaN	678446032599040001	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	1146	False	False	NaN	NaN
1509	1509	1510	"@DomineekSmith: @realDonaldTrump is the best ...	False	3719	NaN	2015-12-20 05:11:04	False	NaN	678442470720577537	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	1151	False	False	NaN	NaN
1510	1510	1511	Another great accolade for @TrumpGolf. Highly ...	False	2304	NaN	2015-12-14 21:11:12	False	NaN	676509769562251264	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	713	False	False	NaN	NaN
1511	1511	1512	Record of Health: https://t.co/ZDDDawwYVl\n#Ma...	False	2599	NaN	2015-12-14 20:09:15	False	NaN	676494179216805888	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	952	False	False	NaN	NaN

	Unnamed: 0.1	Unnamed: 0	text	favorited	favoriteCount	replyToSN	created	truncated	replyToSID	id	replyToUID	statusSource	screenName	retweetCount	isRetweet	retweeted	longitude	latitude
0	0	1	My economic policy speech will be carried live...	False	9214	NaN	2016-08-08 15:20:44	False	NaN	762669882571980801	NaN	0	realDonaldTrump	3107	False	False	NaN	NaN
1	1	2	Join me in Fayetteville, North Carolina tomorr...	False	6981	NaN	2016-08-08 13:28:20	False	NaN	762641595439190016	NaN	1	realDonaldTrump	2390	False	False	NaN	NaN
2	2	3	#ICYMI: "Will Media Apologize to Trump?" https...	False	15724	NaN	2016-08-08 00:05:54	False	NaN	762439658911338496	NaN	1	realDonaldTrump	6691	False	False	NaN	NaN
3	3	4	Michael Morell, the lightweight former Acting ...	False	19837	NaN	2016-08-07 23:09:08	False	NaN	762425371874557952	NaN	0	realDonaldTrump	6402	False	False	NaN	NaN
4	4	5	The media is going crazy. They totally distort...	False	34051	NaN	2016-08-07 21:31:46	False	NaN	762400869858115588	NaN	0	realDonaldTrump	11717	False	False	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1507	1507	1508	"@constant4change: Trump tops Dem candidates o...	False	2590	NaN	2015-12-20 08:21:23	False	NaN	678490367285678081	NaN	0	realDonaldTrump	990	False	False	NaN	NaN
1508	1508	1509	"@autumnandews08: @realDonaldTrump @jonkarl Hi...	False	3550	NaN	2015-12-20 05:25:13	False	NaN	678446032599040001	NaN	0	realDonaldTrump	1146	False	False	NaN	NaN
1509	1509	1510	"@DomineekSmith: @realDonaldTrump is the best ...	False	3719	NaN	2015-12-20 05:11:04	False	NaN	678442470720577537	NaN	0	realDonaldTrump	1151	False	False	NaN	NaN
1510	1510	1511	Another great accolade for @TrumpGolf. Highly ...	False	2304	NaN	2015-12-14 21:11:12	False	NaN	676509769562251264	NaN	1	realDonaldTrump	713	False	False	NaN	NaN
1511	1511	1512	Record of Health: https://t.co/ZDDDawwYVl\n#Ma...	False	2599	NaN	2015-12-14 20:09:15	False	NaN	676494179216805888	NaN	1	realDonaldTrump	952	False	False	NaN	NaN

	Unnamed: 0.1	Unnamed: 0	text	favorited	favoriteCount	replyToSN	created	truncated	replyToSID	id	replyToUID	statusSource	screenName	retweetCount	isRetweet	retweeted	longitude	latitude
7	7	8	.@Larry_Kudlow - 'Donald Trump Is the middle-c...	False	19543	NaN	2016-08-07 02:03:39	False	NaN	762106904436961280	NaN	1	realDonaldTrump	7930	False	False	NaN	NaN
32	32	33	Thank you to the amazing law enforcement offic...	False	34588	NaN	2016-08-03 22:10:11	False	NaN	760960985524043777	NaN	1	realDonaldTrump	10168	False	False	NaN	NaN
311	311	312	#MakeAmericaGreatAgain #ImWithYou\nhttps://t.c...	False	15279	NaN	2016-07-12 02:12:19	False	NaN	752686998826131456	NaN	1	realDonaldTrump	5392	False	False	NaN	NaN
147	147	148	Crooked Hillary Clinton wants to flood our cou...	False	54206	NaN	2016-07-27 10:08:35	False	NaN	758242674646323200	NaN	0	realDonaldTrump	16640	False	False	NaN	NaN
1458	1458	1459	"@creta_r: @realDonaldTrump Looking forward to...	False	6439	NaN	2016-01-28 04:26:38	False	NaN	692564413811941376	NaN	0	realDonaldTrump	2099	False	False	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1261	1261	1262	The Better Business Bureau report, with an A r...	False	18624	NaN	2016-03-04 04:30:45	False	NaN	705611414128500736	NaN	1	realDonaldTrump	8612	False	False	NaN	NaN
1465	1465	1466	I am self funding my campaign so I do not owe ...	False	13385	NaN	2016-01-27 21:21:04	False	NaN	692457315941900288	NaN	1	realDonaldTrump	4696	False	False	NaN	NaN
573	573	574	"@southpaw816: @SenSanders fans, greatest way ...	False	10389	NaN	2016-06-08 12:00:55	False	NaN	740513939574951936	NaN	0	realDonaldTrump	3429	False	False	NaN	NaN
1263	1263	1264	Senator Sessions will serve as the \nChairman ...	False	14258	NaN	2016-03-04 01:20:00	False	NaN	705563410528915457	NaN	1	realDonaldTrump	6281	False	False	NaN	NaN
1403	1403	1404	Loved the debate last night, and almost everyo...	False	11297	NaN	2016-02-14 20:55:59	False	NaN	698973988891009024	NaN	0	realDonaldTrump	3305	False	False	NaN	NaN

	Unnamed: 0.1	Unnamed: 0	text	favorited	favoriteCount	replyToSN	created	truncated	replyToSID	id	replyToUID	statusSource	screenName	retweetCount	isRetweet	retweeted	longitude	latitude
339	339	340	Even the once great Caesars is bankrupt in A.C...	False	14936	NaN	2016-07-06 13:24:50	False	NaN	750681915565477888	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	0	False	False	NaN	NaN
146	146	147	Our not very bright Vice President, Joe Biden,...	False	38199	NaN	2016-07-27 12:57:20	False	NaN	758285141982711808	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	1	False	False	NaN	NaN
1356	1356	1357	Ted Cruz should be disqualified from his fraud...	False	8287	NaN	2016-02-22 21:11:17	False	NaN	701876939095543808	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	0	False	False	NaN	NaN
1315	1315	1316	"@itsblakec: @realDonaldTrump Trump is a geniu...	False	7750	NaN	2016-02-27 04:53:34	False	NaN	703442830211964928	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	0	False	False	NaN	NaN
1447	1447	1448	"@stephbewitching @realDonaldTrump All you hat...	False	10449	NaN	2016-02-02 19:44:56	False	NaN	694607450750717952	NaN	<a href="http://twitter.com" rel="nofollow">Tw...	realDonaldTrump	0	False	False	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1393	1393	1394	JEB is a hypocrite! Used massive private "Emin...	False	7296	NaN	2016-02-15 17:29:40	False	NaN	699284455282884612	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	0	False	False	NaN	NaN
1344	1344	1345	THANK YOU LAS VEGAS, NEVADA!\n#NevadaCaucus #V...	False	8866	NaN	2016-02-23 05:19:29	False	NaN	701999802033111040	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	0	False	False	NaN	NaN
527	527	528	Join me in Atlanta on Wednesday- at noon! #Tru...	False	12617	NaN	2016-06-14 00:20:57	False	NaN	742512112614944768	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	0	False	False	NaN	NaN
1149	1149	1150	So the highly overrated anchor, @megynkelly, i...	False	27373	NaN	2016-03-20 19:10:55	False	NaN	711631122036293632	NaN	<a href="http://twitter.com/download/android" ...	realDonaldTrump	1	False	False	NaN	NaN
1289	1289	1290	My wife @MELANIATRUMP will be #OnTheRecord w/ ...	False	9731	NaN	2016-03-02 19:54:30	False	NaN	705119106052308992	NaN	<a href="http://twitter.com/download/iphone" r...	realDonaldTrump	0	False	False	NaN	NaN

Preprocessing¶

Machine Learning: Train an Exact Model¶

Machine Learning: Train an Blurred Model¶

Task 1¶

Task 2¶

Task 3¶

Finding the Features That Lead to Differentiation¶

Task 4¶

Classifying Based on Retweet Numbers¶

Task 5¶