import pandas as pd
import numpy as np
import re
! pip install nltk
import nltk
nltk.download('stopwords')
# Gensim, for topic modeling
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline
Requirement already satisfied: nltk in c:\users\colto\anaconda3\lib\site-packages (3.8.1) Requirement already satisfied: click in c:\users\colto\anaconda3\lib\site-packages (from nltk) (8.0.4) Requirement already satisfied: joblib in c:\users\colto\anaconda3\lib\site-packages (from nltk) (1.2.0) Requirement already satisfied: regex>=2021.8.3 in c:\users\colto\anaconda3\lib\site-packages (from nltk) (2022.7.9) Requirement already satisfied: tqdm in c:\users\colto\anaconda3\lib\site-packages (from nltk) (4.65.0) Requirement already satisfied: colorama in c:\users\colto\anaconda3\lib\site-packages (from click->nltk) (0.4.6)
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\colto\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\stopwords.zip.
# Load the video-essay dataset and discard the stray unnamed trailing column
# in a single chained expression.
df = pd.read_csv("Data/YoutubeVideoEssayProject.csv").drop(columns="Unnamed: 9")
df
| Title | Views | Likes | Date | Length | Transcript | Description | Creator | Creator Subscribers | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | David Lynch: The Treachery of Language | 275,647 | 14,000 | 3/30/18 | 11:10 | 0:00\n[Interviewer] “David Lynch has described... | David Lynch is famous for his reluctance to ve... | What's So Great About That? | 79,900 |
| 1 | CTRL+ALT+DEL | SLA:3 | 2,400,000 | 83,000 | 4/27/18 | 34:10 | 0:00\nHi, I'm Hareton Splimby, and welcome to ... | In attempting to go fast, Hareton Splimby suff... | hbomberguy | 1,290,000 |
| 2 | The Hobbit: A Long-Expected Autopsy (Part 1/2) | 5,000,000 | 126,000 | 3/28/18 | 36:48 | 0:07\nIn mid-2015, less than a year before her... | In which we look back at The Hobbit trilogy an... | Lindsay Ellis | 5,086,953 |
| 3 | Making Games Better for Gamers with Colourblin... | 370,306 | 19,000 | 8/22/18 | 13:55 | 0:00\nVideo games are for everyone, and they c... | Video games are for everyone. But disabled peo... | Game Maker's Toolkit | 1,510,000 |
| 4 | FAKE FRIENDS EPISODE TWO: parasocial hell | 224,898 | 10,000 | 8/11/18 | 1:54:34 | 0:03\n[Shannon] Grape-kun was a real-life peng... | it's done!!!!\n\n \n\n / struccimovies \nhttp... | StrucciMovies | 46,700 |
| 5 | Incels | ContraPoints | 5,823,697 | 243,000 | 8/17/18 | 34:05 | 0:00\n[Mendelssohn: String Quartet No. 6 in F ... | Hello boys. Let's talk about bone structure.\n... | ContraPoints | 5,823,697 |
| 6 | DOOM: The Fake Outrage | 845,440 | 38,000 | 9/1/18 | 24:32 | 0:00\nhello everyone today we're going to be\n... | Countdown to the first accusation of meta-meta... | Shaun | 662,000 |
| 7 | Disney - The Magic of Animation | 610,400 | 45,000 | 10/3/18 | 15:47 | 0:00\nforeign\n0:07\nwhat is it about Disney A... | A look at the 12 principles of animation devel... | kaptainkristian | 45,000 |
| 8 | Nostalghia Critique | 106,500 | 5,400 | 11/27/18 | 9:11 | 0:07\nThere are a few things you can't do on Y... | A reflection on cinema, self, and other nonsen... | KyleKallgreen | 80,800 |
| 9 | In Search Of A Flat Earth | 3,423,150 | 129,000 | 9/11/20 | 1:16:16 | Prologue\n0:00\n[Laid back folk music] A few m... | Clickbait Title: The Twist at 37 Minutes Will ... | Folding Ideas | 892,000 |
| 10 | The Satirical Resurgence of Reefer Madness | 81,974 | 6,900 | 11/10/20 | 26:58 | Transcript | https://www.snap4freedom.org/home\n \n\n / yha... | Yhara zayd | 242,000 |
| 11 | The Strange Reality of Roller Coaster Tycoon | 1,383,521 | 55,000 | 7/19/20 | 18:11 | 0:09\nThere is at least one roller coaster des... | Both birds are yellow but the louder one is ye... | Jacob Geller | 1,150,000 |
| 12 | CATS & The Weird Mind of TS Eliot | An Analysis | 334,188 | 16,000 | 3/24/20 | 58:50 | 0:12\nthe speaking on eliot is a difficult mat... | If you want to directly support me and see thi... | Maggie Mae Fish | 221,000 |
| 13 | The Anatomy of Stan Culture | 108,681 | 8,200 | 3/8/20 | 18:28 | 0:00\nthe audience is important to any\n0:01\n... | How much do you love celebrities? As a fan, wh... | Intelexual Media | 281,000 |
| 14 | On Writing: Mental Illness in Video Games | a ... | 246,532 | 21,000 | 4/3/20 | 33:33 | 0:00\nthis video is going to deal with sensiti... | It's only because of independent support throu... | Hello Future Me | 1,008,000 |
| 15 | Why Anime is for Black People - Hip Hop x Anim... | 180,348 | 15,000 | 9/25/20 | 18:34 | 0:00\nanybody who's been alive in the past 20\... | Over the years, it's hard not to point out ani... | Beyond The Bot | 84,100 |
| 16 | What Is *Good* Queer Representation in 2020? | 181,933 | 12,000 | 8/14/20 | 48:21 | 0:00\nthis was actually a really difficult vid... | Clips and Spoilers from the following shows:\n... | Princess Weekes | 222,000 |
| 17 | Fallout: New Vegas Is Genius, And Here's Why | 9,754,066 | 251,000 | 12/19/20 | 1:37:41 | 0:00\n- The first two "Fallouts" are still two... | (Spoilers for New Vegas, obviously)\n\nMy Twit... | hbomberguy | 1,290,000 |
| 18 | Whisper of the Heart: How Does It Feel to Be a... | 126,458 | 9,500 | 5/28/20 | 13:30 | 0:03\n"Alright..."\n0:05\n"And action!"\n0:06\... | Accented Cinema - Episode 37\n\nThis is a bit ... | Accented Cinema | 456,000 |
| 19 | Your Island is a Commune pt. 1 | Nowhere Grotesk | 87,724 | 7,300 | 1/11/20 | 22:31 | 0:00\nI've never engaged with the game the way... | Capitalism or community, choose one. Join us n... | Nowhere Grotesk | 8,510 |
| 20 | The Market of Humiliating Black Women | 939,173 | 102,000 | 2/22/21 | 26:11 | 0:00\nHi everyone. Welcome to this video. My ... | Hey cuties welcome back :) In today's video I'... | Tee Noir | 631,000 |
| 21 | The Day Rue "Became" Black | 1,965,436 | 126,000 | 5/19/21 | 35:33 | 0:00\nGreetings and salutations, before we get... | Go to https://nordvpn.com/yhara or use code yh... | Yhara zayd | 242,000 |
| 22 | Infantilization and the Body Hair Debate | 1,930,899 | 140,000 | 8/14/21 | 35:40 | 1:09\nSo, I got Brazilian wax recently. [cric... | it took 13 hours to edit this.\nSOCIALS:\nko-f... | Shanspeare | 590,000 |
| 23 | Bo Burnham vs. Jeff Bezos | 1,345,904 | 80,000 | 8/20/21 | 2:26:41 | 0:01\n[ Screams ] Every book of the Bible was ... | COVID make man sad\n\nPatreon: \n\n / cjthex ... | CJ The X | 297,000 |
| 24 | The reign of the Slim-Thick Influencer | Khadi... | 1,886,067 | 114,000 | 8/22/21 | 54:18 | 0:00\nThis video is brought to you by Squaresp... | Head to https://www.squarespace.com/khadijamb... | Khadija Mbowe | 611,000 |
| 25 | make more characters bi, you cowards: why (not... | 8,217 | NaN | 8/28/21 | 51:16 | 0:00\nhello void and all who inhabit it, it's ... | sorry again for the glitchy video. the perks o... | voice memos for the void | 10,000 |
| 26 | The Black Right Wing || Anansi’s Library | 26,220 | 2,400 | 9/24/21 | 8:17 | 0:00\nour justice system is gone blm has taken... | NEW TWITTER: \n\n / localpunkanansi NEW PAT... | Anansi's Library | 25,200 |
| 27 | On Leftist Disunity | 72,060 | 7,200 | 10/27/21 | 11:34 | 0:00\nNot long after I’d become radicalized, b... | Watch the whole video before commenting. As I’... | Andrewism | 161,000 |
| 28 | Break Bread | 751,463 | 50,000 | 12/6/21 | 1:40:30 | 0:04\nso in the following years as the anti-sj... | After a few months of “success” on the platfor... | F.D Signifier | 611,000 |
| 29 | Meet Dave | Captain Ahab: The Story of Dave St... | 828,195 | 18,000 | 3/1/22 | 48:33 | 0:00\n(pulsing music)\n0:05\n- [Jon] It's the ... | “Who’s Dave Stieb?” you might be asking. Well,... | Secret Base | 1,360,000 |
| 30 | Why Panzer Dragoon Saga is the Greatest RPG No... | 112,523 | 6,700 | 6/24/22 | 53:30 | 0:02\nearth may not be forever but we still ha... | This is the story of the most important and in... | Micheal Saba | 86,300 |
| 31 | Nice White Teachers, Bad Brown Schools: Hollyw... | 331,890 | 22,000 | 6/25/22 | 40:53 | 0:18\nthis video is brought to you by mubi a c... | Hirokazu Koreeda: A Double Bill is now streami... | Yhara zayd | 242,000 |
| 32 | Instagram Hates Its Users | 1,001,437 | 57,000 | 8/31/22 | 31:33 | 0:00\neveryone is mad at instagram right now a... | Use code JARVIS130 to get $130 off across 6 Fa... | Jarvis Johnson | 2,040,000 |
| 33 | Fixing My Brain with Automated Therapy | 979,429 | 57,000 | 9/2/22 | 52:50 | 0:00\nnow when we hear the word therapy, certa... | People laugh about this, self-soothing engines... | Jacob Geller | 1,150,000 |
| 34 | Parking lots are everywhere and nowhere | 13,904 | 1,200 | 9/22/22 | 11:34 | 0:00\nWe all have those formative childhood ex... | Parking lots are the ambient architecture of a... | What's So Great About That? | 79,900 |
| 35 | How Degrowth Can Save The World | 136,004 | 9,700 | 11/2/22 | 36:54 | 0:00\nour world is dying. Or more accurately, ... | Capitalism is based on the cancerous logic of ... | Andrewism | 161,000 |
# Text cleaning for the transcript and description columns of df.

# Caption timestamps as whole tokens: M:SS, MM:SS and H:MM:SS.
_TIMESTAMP_RE = re.compile(r'\b(?:\d+:)?\d{1,2}:\d{2}\b')
# Any run of two or more whitespace characters.
_WHITESPACE_RE = re.compile(r'\s{2,}')


def _clean_item(item):
    """Return one value as a cleaned single-line string."""
    text = str(item).replace("\n", " ")
    text = _TIMESTAMP_RE.sub(" ", text)
    text = text.replace('"', "")
    # Collapse whitespace last so gaps left by the removals above are merged.
    return _WHITESPACE_RE.sub(" ", text)


def clean_text(list_):
    """Strip caption artefacts from every entry of a list or pandas Series.

    Each entry is converted with ``str()``, newlines become spaces,
    timestamps such as ``0:07``, ``12:34`` or ``1:02:03`` are removed,
    straight double quotes are dropped, and runs of whitespace are
    collapsed to a single space.

    The whole timestamp is matched, so no stray digits are left behind
    and standalone "1" / "2" tokens in the text are preserved.

    Parameters:
        list_: a mutable sequence (list) or pandas Series of values.

    Returns:
        The same object that was passed in, with its entries replaced
        in place by the cleaned strings.
    """
    cleaned = [_clean_item(item) for item in list_]
    # A pandas Series must be written by position (.iloc); plain [] on a
    # Series is label-based and would add rows when the index is not 0..n-1.
    target = list_.iloc if hasattr(list_, "iloc") else list_
    for position, text in enumerate(cleaned):
        target[position] = text
    return list_
# Run the cleaner over both free-text columns, transcript first.
for column in ("Transcript", "Description"):
    df[column] = clean_text(df[column])
df
| Title | Views | Likes | Date | Length | Transcript | Description | Creator | Creator Subscribers | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | David Lynch: The Treachery of Language | 275,647 | 14,000 | 3/30/18 | 11:10 | [Interviewer] “David Lynch has described his ... | David Lynch is famous for his reluctance to ve... | What's So Great About That? | 79,900 |
| 1 | CTRL+ALT+DEL | SLA:3 | 2,400,000 | 83,000 | 4/27/18 | 34:10 | Hi, I'm Hareton Splimby, and welcome to Serio... | In attempting to go fast, Hareton Splimby suff... | hbomberguy | 1,290,000 |
| 2 | The Hobbit: A Long-Expected Autopsy (Part 1/2) | 5,000,000 | 126,000 | 3/28/18 | 36:48 | In mid-2015, less than a year before her deat... | In which we look back at The Hobbit trilogy an... | Lindsay Ellis | 5,086,953 |
| 3 | Making Games Better for Gamers with Colourblin... | 370,306 | 19,000 | 8/22/18 | 13:55 | Video games are for everyone, and they can ev... | Video games are for everyone. But disabled peo... | Game Maker's Toolkit | 1,510,000 |
| 4 | FAKE FRIENDS EPISODE TWO: parasocial hell | 224,898 | 10,000 | 8/11/18 | 1:54:34 | [Shannon] Grape-kun was a real-life penguin t... | it's done!!!! / struccimovies https://ko-fi.co... | StrucciMovies | 46,700 |
| 5 | Incels | ContraPoints | 5,823,697 | 243,000 | 8/17/18 | 34:05 | [Mendelssohn: String Quartet No. 6 in F minor... | Hello boys. Let's talk about bone structure. S... | ContraPoints | 5,823,697 |
| 6 | DOOM: The Fake Outrage | 845,440 | 38,000 | 9/1/18 | 24:32 | hello everyone today we're going to be talkin... | Countdown to the first accusation of meta-meta... | Shaun | 662,000 |
| 7 | Disney - The Magic of Animation | 610,400 | 45,000 | 10/3/18 | 15:47 | foreign what is it about Disney Animation tha... | A look at the 12 principles of animation devel... | kaptainkristian | 45,000 |
| 8 | Nostalghia Critique | 106,500 | 5,400 | 11/27/18 | 9:11 | There are a few things you can't do on YouTub... | A reflection on cinema, self, and other nonsen... | KyleKallgreen | 80,800 |
| 9 | In Search Of A Flat Earth | 3,423,150 | 129,000 | 9/11/20 | 1:16:16 | Prologue [Laid back folk music] A few minutes ... | Clickbait Title: The Twist at 37 Minutes Will ... | Folding Ideas | 892,000 |
| 10 | The Satirical Resurgence of Reefer Madness | 81,974 | 6,900 | 11/10/20 | 26:58 | Transcript | https://www.snap4freedom.org/home / yharazayd ... | Yhara zayd | 242,000 |
| 11 | The Strange Reality of Roller Coaster Tycoon | 1,383,521 | 55,000 | 7/19/20 | 18:11 | There is at least one roller coaster designed... | Both birds are yellow but the louder one is ye... | Jacob Geller | 1,150,000 |
| 12 | CATS & The Weird Mind of TS Eliot | An Analysis | 334,188 | 16,000 | 3/24/20 | 58:50 | the speaking on eliot is a difficult matter, ... | If you want to directly support me and see thi... | Maggie Mae Fish | 221,000 |
| 13 | The Anatomy of Stan Culture | 108,681 | 8,200 | 3/8/20 | 18:28 | the audience is important to any celebrity's ... | How much do you love celebrities? As a fan, wh... | Intelexual Media | 281,000 |
| 14 | On Writing: Mental Illness in Video Games | a ... | 246,532 | 21,000 | 4/3/20 | 33:33 | this video is going to deal with sensitive an... | It's only because of independent support throu... | Hello Future Me | 1,008,000 |
| 15 | Why Anime is for Black People - Hip Hop x Anim... | 180,348 | 15,000 | 9/25/20 | 18:34 | anybody who's been alive in the past 20 years... | Over the years, it's hard not to point out ani... | Beyond The Bot | 84,100 |
| 16 | What Is *Good* Queer Representation in 2020? | 181,933 | 12,000 | 8/14/20 | 48:21 | this was actually a really difficult video to... | Clips and Spoilers from the following shows: S... | Princess Weekes | 222,000 |
| 17 | Fallout: New Vegas Is Genius, And Here's Why | 9,754,066 | 251,000 | 12/19/20 | 1:37:41 | - The first two Fallouts are still two of the... | (Spoilers for New Vegas, obviously) My Twitter... | hbomberguy | 1,290,000 |
| 18 | Whisper of the Heart: How Does It Feel to Be a... | 126,458 | 9,500 | 5/28/20 | 13:30 | Alright... And action! Being an artist myself... | Accented Cinema - Episode 37 This is a bit of ... | Accented Cinema | 456,000 |
| 19 | Your Island is a Commune pt. 1 | Nowhere Grotesk | 87,724 | 7,300 | 1/11/20 | 22:31 | I've never engaged with the game the way I do... | Capitalism or community, choose one. Join us n... | Nowhere Grotesk | 8,510 |
| 20 | The Market of Humiliating Black Women | 939,173 | 102,000 | 2/22/21 | 26:11 | Hi everyone. Welcome to this video. My name i... | Hey cuties welcome back :) In today's video I'... | Tee Noir | 631,000 |
| 21 | The Day Rue "Became" Black | 1,965,436 | 126,000 | 5/19/21 | 35:33 | Greetings and salutations, before we get into... | Go to https://nordvpn.com/yhara or use code yh... | Yhara zayd | 242,000 |
| 22 | Infantilization and the Body Hair Debate | 1,930,899 | 140,000 | 8/14/21 | 35:40 | So, I got Brazilian wax recently. [crickets]... | it took 13 hours to edit this. SOCIALS: ko-fi:... | Shanspeare | 590,000 |
| 23 | Bo Burnham vs. Jeff Bezos | 1,345,904 | 80,000 | 8/20/21 | 2:26:41 | [ Screams ] Every book of the Bible was writt... | COVID make man sad Patreon: / cjthex Twitter: ... | CJ The X | 297,000 |
| 24 | The reign of the Slim-Thick Influencer | Khadi... | 1,886,067 | 114,000 | 8/22/21 | 54:18 | This video is brought to you by Squarespace. ... | Head to https://www.squarespace.com/khadijamb... | Khadija Mbowe | 611,000 |
| 25 | make more characters bi, you cowards: why (not... | 8,217 | NaN | 8/28/21 | 51:16 | hello void and all who inhabit it, it's me an... | sorry again for the glitchy video. the perks o... | voice memos for the void | 10,000 |
| 26 | The Black Right Wing || Anansi’s Library | 26,220 | 2,400 | 9/24/21 | 8:17 | our justice system is gone blm has taken over... | NEW TWITTER: / localpunkanansi NEW PATREON LIN... | Anansi's Library | 25,200 |
| 27 | On Leftist Disunity | 72,060 | 7,200 | 10/27/21 | 11:34 | Not long after I’d become radicalized, began ... | Watch the whole video before commenting. As I’... | Andrewism | 161,000 |
| 28 | Break Bread | 751,463 | 50,000 | 12/6/21 | 1:40:30 | so in the following years as the anti-sjw cha... | After a few months of “success” on the platfor... | F.D Signifier | 611,000 |
| 29 | Meet Dave | Captain Ahab: The Story of Dave St... | 828,195 | 18,000 | 3/1/22 | 48:33 | (pulsing music) - [Jon] It's the final game o... | “Who’s Dave Stieb?” you might be asking. Well,... | Secret Base | 1,360,000 |
| 30 | Why Panzer Dragoon Saga is the Greatest RPG No... | 112,523 | 6,700 | 6/24/22 | 53:30 | earth may not be forever but we still have th... | This is the story of the most important and in... | Micheal Saba | 86,300 |
| 31 | Nice White Teachers, Bad Brown Schools: Hollyw... | 331,890 | 22,000 | 6/25/22 | 40:53 | this video is brought to you by mubi a curate... | Hirokazu Koreeda: A Double Bill is now streami... | Yhara zayd | 242,000 |
| 32 | Instagram Hates Its Users | 1,001,437 | 57,000 | 8/31/22 | 31:33 | everyone is mad at instagram right now and fo... | Use code JARVIS130 to get $130 off across 6 Fa... | Jarvis Johnson | 2,040,000 |
| 33 | Fixing My Brain with Automated Therapy | 979,429 | 57,000 | 9/2/22 | 52:50 | now when we hear the word therapy, certain th... | People laugh about this, self-soothing engines... | Jacob Geller | 1,150,000 |
| 34 | Parking lots are everywhere and nowhere | 13,904 | 1,200 | 9/22/22 | 11:34 | We all have those formative childhood experie... | Parking lots are the ambient architecture of a... | What's So Great About That? | 79,900 |
| 35 | How Degrowth Can Save The World | 136,004 | 9,700 | 11/2/22 | 36:54 | our world is dying. Or more accurately, it is... | Capitalism is based on the cancerous logic of ... | Andrewism | 161,000 |
# Score every transcript term with TF-IDF (English stop words removed, text lowercased)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Transcript'])

# Wide table: one row per video title, one column per vocabulary term
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df = tfidf_df.set_index(df['Title'])

# Long table: one (document, term, tfidf) row per title/term pair
tfidf_df = (
    tfidf_df.stack()
    .reset_index()
    .rename(columns={0: 'tfidf', 'Title': 'document', 'level_1': 'term'})
)

# Display the single highest-scoring term for each document
(
    tfidf_df.sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(1)
)
| document | term | tfidf | |
|---|---|---|---|
| 306869 | Bo Burnham vs. Jeff Bezos | bezos | 0.593870 |
| 374618 | Break Bread | content | 0.307921 |
| 163352 | CATS & The Weird Mind of TS Eliot | An Analysis | eliot | 0.510959 |
| 15631 | CTRL+ALT+DEL | SLA:3 | comic | 0.354365 |
| 83354 | DOOM: The Fake Outrage | doom | 0.477031 |
| 7132 | David Lynch: The Treachery of Language | lynch | 0.663959 |
| 93322 | Disney - The Magic of Animation | action | 0.376898 |
| 58408 | FAKE FRIENDS EPISODE TWO: parasocial hell | grape | 0.437322 |
| 230324 | Fallout: New Vegas Is Genius, And Here's Why | fallout | 0.400686 |
| 442363 | Fixing My Brain with Automated Therapy | eliza | 0.331601 |
| 470344 | How Degrowth Can Save The World | growth | 0.429553 |
| 124277 | In Search Of A Flat Earth | flat | 0.596912 |
| 72416 | Incels | ContraPoints | incels | 0.568979 |
| 297674 | Infantilization and the Body Hair Debate | hair | 0.393264 |
| 431326 | Instagram Hates Its Users | 0.592285 | |
| 42176 | Making Games Better for Gamers with Colourblin... | colour | 0.305557 |
| 396597 | Meet Dave | Captain Ahab: The Story of Dave St... | stieb | 0.667119 |
| 423654 | Nice White Teachers, Bad Brown Schools: Hollyw... | teachers | 0.389858 |
| 108495 | Nostalghia Critique | clips | 0.305741 |
| 365537 | On Leftist Disunity | left | 0.302464 |
| 194845 | On Writing: Mental Illness in Video Games | a ... | player | 0.357307 |
| 453521 | Parking lots are everywhere and nowhere | car | 0.339618 |
| 174633 | The Anatomy of Stan Culture | celebrity | 0.539281 |
| 346796 | The Black Right Wing || Anansi’s Library | black | 0.342267 |
| 289197 | The Day Rue "Became" Black | rue | 0.445351 |
| 32224 | The Hobbit: A Long-Expected Autopsy (Part 1/2) | hobbit | 0.543211 |
| 267080 | The Market of Humiliating Black Women | black | 0.325939 |
| 145030 | The Satirical Resurgence of Reefer Madness | transcript | 1.000000 |
| 148394 | The Strange Reality of Roller Coaster Tycoon | coaster | 0.635670 |
| 320313 | The reign of the Slim-Thick Influencer | Khadi... | body | 0.376040 |
| 221973 | What Is *Good* Queer Representation in 2020? | queer | 0.537510 |
| 249805 | Whisper of the Heart: How Does It Feel to Be a... | shizuku | 0.315560 |
| 199964 | Why Anime is for Black People - Hip Hop x Anim... | anime | 0.524849 |
| 402265 | Why Panzer Dragoon Saga is the Greatest RPG No... | dragoon | 0.515901 |
| 265197 | Your Island is a Commune pt. 1 | Nowhere Grotesk | village | 0.433637 |
| 333442 | make more characters bi, you cowards: why (not... | bi | 0.721916 |
!pip install altair
Requirement already satisfied: altair in c:\users\colto\anaconda3\lib\site-packages (5.1.2) Requirement already satisfied: jinja2 in c:\users\colto\anaconda3\lib\site-packages (from altair) (3.1.2) Requirement already satisfied: jsonschema>=3.0 in c:\users\colto\anaconda3\lib\site-packages (from altair) (4.17.3) Requirement already satisfied: numpy in c:\users\colto\anaconda3\lib\site-packages (from altair) (1.24.3) Requirement already satisfied: packaging in c:\users\colto\anaconda3\lib\site-packages (from altair) (23.1) Requirement already satisfied: pandas>=0.25 in c:\users\colto\anaconda3\lib\site-packages (from altair) (2.0.3) Requirement already satisfied: toolz in c:\users\colto\anaconda3\lib\site-packages (from altair) (0.12.0) Requirement already satisfied: attrs>=17.4.0 in c:\users\colto\anaconda3\lib\site-packages (from jsonschema>=3.0->altair) (22.1.0) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\users\colto\anaconda3\lib\site-packages (from jsonschema>=3.0->altair) (0.18.0) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\colto\anaconda3\lib\site-packages (from pandas>=0.25->altair) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\colto\anaconda3\lib\site-packages (from pandas>=0.25->altair) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\colto\anaconda3\lib\site-packages (from pandas>=0.25->altair) (2023.3) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\colto\anaconda3\lib\site-packages (from jinja2->altair) (2.1.1) Requirement already satisfied: six>=1.5 in c:\users\colto\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.25->altair) (1.16.0)
# Heatmap of the ten highest TF-IDF terms for each video essay transcript
import altair as alt

# Ten best-scoring terms per document
top_tfidf = (
    tfidf_df.sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

# Terms listed here are marked with a red dot in the chart
term_list = ['queer', 'peace']

# Add a tiny random offset to a copy of the scores so ties in the ranking are broken
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0]) * 0.0001
)

# Shared base chart: rank is computed within each document by descending score
base = (
    alt.Chart(top_tfidf_plusRand)
    .encode(x='rank:O', y='document:N')
    .transform_window(
        rank="rank()",
        sort=[alt.SortField("tfidf", order="descending")],
        groupby=["document"],
    )
)

# Layer 1: rectangles coloured by TF-IDF score
heatmap = base.mark_rect().encode(color='tfidf:Q')

# Layer 2: red circle on terms from term_list, '#FFFFFF00' for every other term
circle = base.mark_circle(size=100).encode(
    color=alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00'),
    )
)

# Layer 3: term labels, white on the darker heatmap cells and black elsewhere
text = base.mark_text(baseline='middle').encode(
    text='term:N',
    color=alt.condition(
        alt.datum.tfidf >= 0.23,
        alt.value('white'),
        alt.value('black'),
    ),
)

# Superimpose the three layers and display the result
(heatmap + circle + text).properties(width=600)