Lecture 11

Isabel

2024-05-14

Libraries and API key

library(httr)        # HTTP requests
library(jsonlite)    # JSON parsing
library(tm)          # corpora and document-term matrices
library(SnowballC)   # word stemming
library(sentimentr)  # sentiment analysis
library(dplyr)       # data manipulation
library(ggplot2)     # plotting
library(wordcloud)   # word clouds
nyt_key <- "gJwBQ3SHaeWEGvCQP5VNvhWszYSp3s1P"

Request 2020 article abstracts from the NYT Archive API

endpoint <- "https://api.nytimes.com/svc/archive/v1/"
year_start <- 2020
month <- 1
# The Archive API returns all articles for a given year/month
url <- paste0(endpoint, year_start, "/", month, ".json?api-key=", nyt_key)
response1 <- GET(url)
data1 <- fromJSON(httr::content(response1, "text", encoding = "UTF-8"))
abstracts2020 <- data1$response$docs$abstract
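
Before looping over the remaining months, it is worth confirming that the first call succeeded. A quick check using httr's http_status helper:

http_status(response1)   # should report success (200)
length(abstracts2020)    # number of January abstracts retrieved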
# Loop over the remaining months, appending each month's abstracts
for (month in 2:9) {
  url <- paste0(endpoint, year_start, "/", month, ".json?api-key=", nyt_key)
  response <- GET(url)
  if (response$status_code == 200) {
    data <- fromJSON(httr::content(response, "text", encoding = "UTF-8"))
    abstracts2020 <- c(abstracts2020, data$response$docs$abstract)
  } else {
    cat("\nError status code", response$status_code, "in month", month)
  }
  Sys.sleep(5)  # pause between requests to stay under the API rate limit
}
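
To avoid re-querying the API on every run, the abstracts can be cached to disk. A minimal sketch, assuming the list structure that the next section loads from nyt_abstracts.rda:

# Cache the abstracts in the format expected by load() below
abstracts <- list(abstracts2020 = abstracts2020)
save(abstracts, file = "nyt_abstracts.rda")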

Preprocess for text mining

load("nyt_abstracts.rda")
abstracts2020<- abstracts$abstracts2020
corpus2020 <- Corpus(VectorSource(abstracts2020))
corpus2020 <- tm_map(corpus2020, content_transformer(tolower))
corpus2020 <- tm_map(corpus2020, removeNumbers, upc = TRUE)
corpus2020 <- tm_map(corpus2020, removePunctuation, upc = TRUE)
corpus2020 <- tm_map(corpus2020, removeWords, 
                 c(stopwords("english"), 
                   "new", "york", "times", "'s", "'re", "-", "_", "' s"))
corpus2020 <- tm_map(corpus2020, stripWhitespace)
corpus2020 <- tm_map(corpus2020, stemDocument)
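
A quick sanity check, comparing one raw abstract with its cleaned, stemmed version (document 1 is an arbitrary choice for illustration):

abstracts2020[1]           # original abstract
content(corpus2020[[1]])   # lowercased, cleaned, stemmed text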

Identify the most common words

dtm <- DocumentTermMatrix(corpus2020)   # rows = abstracts, columns = stemmed terms
freq <- colSums(as.matrix(dtm))         # total frequency of each term across the corpus
top_words <- head(sort(freq, decreasing = TRUE), 100)
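
Since wordcloud is already loaded, the top stems can be visualized directly. A minimal sketch; the seed is an arbitrary choice for a reproducible layout:

set.seed(1)
wordcloud(words = names(top_words), freq = top_words,
          max.words = 100, random.order = FALSE)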