This thread was posted on Reddit where they asked people to rank their top five episodes of The Office (USA). I analysed the data to find the overall consensus on the top five episodes.
First, I downloaded the comments and extracted the top-level comments.
# Packages
library(dplyr)
library(tidyr)
library(jsonlite)
# devtools::install_github("dgrtwo/fuzzyjoin")
library(fuzzyjoin)
library(ggplot2)
library(scico)
library(legendry)
# Download comments from Reddit
url <- "https://old.reddit.com/r/DunderMifflin/comments/1r2br1l/.json"
dat <- fromJSON(url, simplifyDataFrame = FALSE)
# Extract comments
top_level <- dat[[2]]$data$children
comments <- do.call(rbind,
lapply(top_level, function(x) {
if (x$kind != "t1") {
return(NULL)
} else {
data.frame(
id = x$data$id,
author = x$data$author,
body = x$data$body
)
}
})
)
Then I had to do some manual re-factoring of the comments to deal with differences in the way people had ranked their top five. I also removed comments where there were no recommendations. But, I kept comments which didn’t give the full five.
Then I cleaned the comments and converted to long format so each row was a recommended episode by a single commenter:
# Clean comments, convert to long-format
comments_clean <- comments %>%
dplyr::select(id, body_clean) %>%
separate_longer_delim(body_clean, ";") %>%
mutate(body_clean = trimws(body_clean)) %>%
filter(body_clean != "") %>%
separate_wider_delim(body_clean, delim = ".", names = c("rank", "ep")) %>%
mutate(
rank = as.numeric(rank),
ep = trimws(tolower(ep)),
ep = case_when(
ep == "the buyout" ~ "broke",
ep == "the mafia" ~ "mafia",
ep == "the beach day" ~ "beach games",
ep == "the return" ~ "travelling salesman & the return",
ep == "the booze cruise" ~ "booze cruise",
ep == "the quiz" ~ "trivia",
ep == "chilli's episode" ~ "the dundies",
ep == "party" ~ "pool party",
ep == "downsize" ~ "the alliance",
TRUE ~ ep))
Then I used fuzzy-matching to line up the recommendations with specific episodes with their season and episode number:
# Read episode titles
episodes <- readLines("./episodes.txt")
# Create dataframe of episode titles
ep_clean <- data.frame(
episode_name = gsub(".*\\s-\\s", "", episodes),
episode_id = gsub("\\s-\\s.*", "", episodes)) %>%
mutate(
episode_name = trimws(tolower(episode_name)),
season = as.numeric(gsub("S0", "", gsub("E.*", "", episode_id))),
episode = as.numeric(gsub(".*E", "", gsub("-.*", "", episode_id))))
# Perform fuzzy matching of episode titles
# Select best match
# fill in episodes that were never recommended
out <- stringdist_join(comments_clean, ep_clean,
by = c("ep" = "episode_name"),
mode = "left",
method = "jw",
max_dist = 99,
distance_col = "dist") %>%
group_by(id, rank) %>%
slice_min(order_by = dist, n = 1) %>%
ungroup() %>%
mutate(rank = factor(rank, levels = 1:5)) %>%
group_by(season, episode, rank) %>%
tally() %>%
right_join(., ep_clean[,c("season", "episode")], by = c("season", "episode")) %>%
mutate(n = ifelse(is.na(n), 0, n))
The top episodes by number of times the episode appeared anywhere in a top five list:
| Episode | N |
|---|---|
| S04E13 – Dinner Party | 47 |
| S05E13 – Stress Relief | 23 |
| S02E12 – The Injury | 17 |
| S01E02 – Diversity Day | 13 |
| S04E18 – Goodbye, Toby | 9 |
Then I made a stacked bar plot of the recommendations for each episode:
# Create bar plot
ggplot(out, aes(x = episode, y = n)) +
geom_col(aes(fill = rank), position = "stack") +
scale_fill_scico_d(name = "Rank", palette = "bamako",
direction = -1, drop = FALSE, na.translate = FALSE) +
facet_wrap(~season, scales = "free_x") +
scale_x_continuous(breaks = 1:max(out$episode)) +
labs(
x = "Season / Episode",
y = "Recommendations") +
theme_bw() +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid.major.x = element_blank())

I used ChatGPT to extract the ratings of each episode of The Office from IMDB. Then compared the IMDB ratings against the total number of times an episode appeared in a top five list:
# Import IMDB ratings
imdb_ratings <- read.csv("./imdb_ratings.csv")
# Combine Reddit top-5s and IMDB ratings
out_imdb <- left_join(out, imdb_ratings, by = c("season", "episode")) %>%
mutate(
n = ifelse(is.na(n), 0, n),
season = as.factor(season))
# Plot comparison of IMDB rating and number of top-5 recommendations
ggplot(out_imdb, aes(x = imdb_rating, y = n)) +
geom_point(aes(colour = season), position = position_jitter(width = 0.02, height = 0), alpha = 0.5) +
scale_colour_scico_d(name = "Season", palette = "batlow") +
theme_bw() +
labs(
x = "IMDB rating",
y = "N recommendations")
