Introduction to TF-IDF¶

While calculating the most frequent words in a text can be useful, those words usually aren’t the most interesting ones, even after we remove stop words. TF-IDF is a method that builds on word frequency but more specifically tries to identify the most distinctively frequent or significant words in a document.

TF-IDF = term_frequency * inverse_document_frequency
term_frequency = the number of times a given term appears in a document
inverse_document_frequency = log(total number of documents / number of documents that contain the term) + 1

The reason we take the inverse, or flipped fraction, of document frequency is to boost the rarer words that occur in relatively few documents.
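
To make the formula concrete, here is a minimal worked example on a made-up corpus of three tiny "documents" (the sentences are invented purely for illustration; the log is the natural log, matching the calculation we do later in this notebook):

In [ ]:
import math

# A toy corpus of three tiny "documents" (invented purely for illustration)
documents = [
    "the king spoke to the queen",
    "the queen smiled",
    "the army marched",
]
total_number_of_documents = len(documents)

def tfidf(term, document):
    term_frequency = document.split().count(term)  # times the term appears in this document
    document_frequency = sum(term in doc.split() for doc in documents)  # documents containing the term
    inverse_document_frequency = math.log(total_number_of_documents / document_frequency) + 1  # log(N / df) + 1
    return term_frequency * inverse_document_frequency

# "the" appears in every document, so its IDF stays low; "king" appears in only one, so its IDF is higher
print(tfidf('the', documents[0]))   # 2 * (log(3/3) + 1) = 2.0
print(tfidf('king', documents[0]))  # 1 * (log(3/1) + 1) ≈ 2.10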

TF-IDF: Preprocessing¶

In [ ]:
import pandas as pd # for working with dataframes
import json # for reading JSON data files

The book is available at: https://babel.hathitrust.org/cgi/pt?id=loc.ark:/13960/t6737fd9d

In [ ]:
# Load the json datafile
file_path = 'Data/loc.ark+=13960=t6737fd9d.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data)
In [ ]:
# Word frequency data for a single page (page index 60)
data['features']['pages'][60]['body']
In [ ]:
# Save the frequency of each word in each page to a dataframe
page_list = [] # empty lists to record the information
token_list = []
count_list = []
for i in range(len(data['features']['pages'])): # loop through each page
    if data['features']['pages'][i]['body'] is not None: # if that page has word frequency information
        for token in data['features']['pages'][i]['body']['tokenPosCount']: # loop through each word
            token_count = 0
            for pos_keys in data['features']['pages'][i]['body']['tokenPosCount'][token]: # add up the total occurrences of that word
                token_count += data['features']['pages'][i]['body']['tokenPosCount'][token][pos_keys]
            page_list.append(i+1) # add one to page number because there is no page 0
            token_list.append(token) # add the word
            count_list.append(token_count) # add the frequency of the word
word_count_by_page = pd.DataFrame({
    'Page': page_list,
    'Token': token_list,
    'Count': count_list 
}) # save the data to a dataframe
word_count_by_page
In [ ]:
# Lowercase the tokens and combine counts for the same word on the same page
word_count_by_page = word_count_by_page.groupby([word_count_by_page['Token'].str.lower(), 'Page'])['Count'].sum().reset_index()
word_count_by_page
In [ ]:
# Remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

word_count_by_page = word_count_by_page.drop(word_count_by_page[word_count_by_page['Token'].isin(stop_words)].index).reset_index(drop=True)
word_count_by_page
In [ ]:
# Remove punctuation (drop any token that contains a non-alphabetic character)
word_count_by_page = word_count_by_page.drop(word_count_by_page[word_count_by_page['Token'].str.contains(r'[^A-Za-z\s]', regex=True)].index).reset_index(drop=True)
word_count_by_page
In [ ]:
# Add act number information based on page (these boundaries are created based on close examination)
def add_act_number(page):
    if page >= 53 and page <= 74:
        return "Act I"
    elif page >= 76 and page <= 108:
        return "Act II"
    elif page >= 109 and page <= 139:
        return "Act III"
    elif page >= 140 and page <= 164:
        return "Act IV"
    elif page >= 165 and page <= 179:
        return "Act V"
    
word_count_by_page['Act'] = word_count_by_page['Page'].apply(add_act_number)
word_count_by_page
In [ ]:
# Keep only the word frequencies for pages that fall within an act
word_count_by_act = word_count_by_page[word_count_by_page['Act'].notna()]
word_count_by_act
In [ ]:
# Sum word counts for each act
word_count_by_act = word_count_by_act.groupby(['Act', 'Token'])[['Count']].sum().reset_index()
word_count_by_act

TF-IDF: Build Model¶

In [ ]:
# Rename the columns so that they’re consistent with the TF-IDF vocabulary that we’ve been using
word_frequency_df  = word_count_by_act.rename(columns={'Token': 'term', 'Count': 'term_frequency'})
word_frequency_df 
In [ ]:
# Create a separate DataFrame that counts how many acts each term appears in
document_frequency_df = (word_frequency_df.groupby(['Act', 'term']).size().unstack()).sum().reset_index()
document_frequency_df = document_frequency_df.rename(columns={0:'document_frequency'})
document_frequency_df
In [ ]:
# Merge the dataframes together, so that for each term in each act we get its term frequency in that act
# and the number of acts in which the term appears across the whole play
word_frequency_df = word_frequency_df.merge(document_frequency_df)
word_frequency_df
In [ ]:
# Calculate total number of acts for inverse document frequency
total_number_of_acts = word_frequency_df['Act'].nunique()
total_number_of_acts
In [ ]:
# Calculate inverse document frequency
import numpy as np # performing calculations on arrays
word_frequency_df['idf'] = np.log((total_number_of_acts) / (word_frequency_df['document_frequency'])) + 1
word_frequency_df
In [ ]:
# Calculate TF-IDF scores
word_frequency_df['tfidf'] = word_frequency_df['term_frequency'] * word_frequency_df['idf']
word_frequency_df
In [ ]:
# Sort the dataframe to get top 5 words with highest TF-IDF scores in each act
word_frequency_df.sort_values(by=['Act', 'tfidf'], ascending=[True,False]).groupby(['Act']).head(5)

Task 1¶

Examine the top 5 words based on TF-IDF score for each act. Refer to the definition of TF-IDF and address the following questions: (1) What characteristics must a word possess to become a "top word" with the highest TF-IDF score? (2) How does a TF-IDF score differ from raw word frequency? (3) Which types of words would become top words when using raw word frequency?

Task 2¶

In [ ]:
# Compute the word with the highest TF-IDF score on each page (instead of each act).
# Use the "word_count_by_page" dataframe and adapt the code above to produce the word frequency for each page,
# then generate the document frequency of each word, merge the two dataframes, calculate the total number of pages,
# compute IDF and TF-IDF, and finally sort by Page and TF-IDF. Group by "Page" and select the top word for each page.

TF-IDF with Scikit-Learn¶

In [ ]:
# Load the dataset of US inaugural addresses
US_inaugural = pd.read_csv('Data/US_Inaugural_Addresses.csv')
US_inaugural
In [ ]:
# Initialize TfidfVectorizer, using English stopwords and converting words to lowercase
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
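
A note on the formula: scikit-learn's TfidfVectorizer does not use exactly the calculation from the first half of this notebook. By default it smooths the IDF (log((1 + N) / (1 + df)) + 1) and L2-normalizes each document's vector, which is why its scores fall between 0 and 1. Purely as an optional sketch (we keep the defaults in everything that follows), you could turn those options off to get scores closer to the manual calculation:

In [ ]:
# Optional: a vectorizer configured to mimic the manual formula above
# (no IDF smoothing, no length normalization); not used in the rest of this notebook
manual_style_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True,
                                          smooth_idf=False, norm=None)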
In [ ]:
# Generate a dataframe of tfidf values using TfidfVectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(US_inaugural['Text']) # Generate a matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out()) # Convert matrix to dataframe
tfidf_df.set_index(US_inaugural['Title'], inplace=True) # Replace the index to be the name of the inaugural speeches
tfidf_df
In [ ]:
# Reorganize the DataFrame so that the words are in rows rather than columns
tfidf_df.stack().reset_index()
In [ ]:
# Calculate the word with the highest TF-IDF score in each inaugural address
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'Title': 'document','level_1': 'term'})
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(1)

Task 3¶

In [ ]:
# Compute the word with the highest TF-IDF score for each inaugural address without removing stop words.
# Define a new TfidfVectorizer that does not filter out stop words (i.e., leave out the stop_words argument),
# then copy the code above using that new vectorizer: fit_transform, convert the generated matrix to a DataFrame,
# set the index to the title of the address, reorganize the dataframe, rename the columns,
# and finally sort the values and select the top word for each inaugural address.
# (Don't be surprised if the top words are mostly the same word. That's why stop word removal is important!)
In [ ]:
# for declarative statistical visualization
!pip install altair
In [ ]:
# Some fancy visualizations to highlight the words with the highest TF-IDF scores in each inaugural address
import altair as alt

top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

# Terms in this list will get a red dot in the visualization
term_list = ['war', 'peace']

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)

Task 4¶

Based on our explorations of TF-IDF scores, address the following questions: (1) What limitations do you think the TF-IDF method has? (2) Can you suggest another potential application for this method? Please provide an example from either academic research or real-life situations and explain the advantages that TF-IDF calculation could offer in that context.

Sentiment Analysis¶

We use VADER for sentiment analysis. VADER, which stands for Valence Aware Dictionary and sEntiment Reasoner, calculates the sentiment of a text by referring to a lexicon of words that have been assigned sentiment scores, as well as by applying a handful of simple rules.

In [ ]:
# Install VADER Sentiment analysis
!pip install vaderSentiment
In [ ]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER so we can use it later
sentimentAnalyser = SentimentIntensityAnalyzer()
In [ ]:
sentimentAnalyser.polarity_scores("I like sentiment analysis")
In [ ]:
sentimentAnalyser.polarity_scores("I don't like sentiment analysis")
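
VADER's rules cover more than negation: exclamation points, capitalization, and degree modifiers such as "really" intensify a score, and a contrastive "but" shifts weight toward the clause that follows it. A few comparisons to try (the exact numbers depend on your VADER version):

In [ ]:
# Punctuation, capitalization, and degree modifiers intensify the sentiment,
# while "but" shifts the emphasis to the second clause; compare these with the scores above
print(sentimentAnalyser.polarity_scores("I like sentiment analysis!!!"))
print(sentimentAnalyser.polarity_scores("I REALLY like sentiment analysis"))
print(sentimentAnalyser.polarity_scores("I like sentiment analysis, but it has real limits"))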
In [ ]:
# Calculate sentiment score for a text
def calculate_sentiment(text):
    # Run VADER on the text
    scores = sentimentAnalyser.polarity_scores(text)
    # Extract the compound score
    compound_score = scores['compound']
    # Return compound score
    return compound_score
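
As a quick check, the helper returns only the compound score for a single string (the sentence below is just an arbitrary example):

In [ ]:
# Apply calculate_sentiment to one sentence; it returns just the compound score
calculate_sentiment("I don't like sentiment analysis")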

Task 5¶

Calculate the sentiment scores of Trump's tweets by applying the "calculate_sentiment" function to the "text" column of the "trump" dataframe. Afterward, print the texts and sentiment scores for the first five tweets in the dataframe. Do the sentiment scores align with your understanding of the texts? Keep in mind that the compound score ranges from -1 (most negative) to 1 (most positive).
Include your code in the following cell (code) and your discussion in the cell after it (markdown).

In [ ]:
trump = pd.read_csv("Data/trump.csv")