# Isabel
# 2024-05-14
# ---- Download NYT Archive API abstracts for months 1-9 of 2020 ----
# Requires `nyt_key` (the API key) to be defined before this point, and GET()
# and fromJSON() to be attached (presumably from httr and jsonlite -- confirm
# against the library() calls for this script).
endpoint <- "https://api.nytimes.com/svc/archive/v1/"
year_start <- 2020

# Month 1 is requested on its own so that `response1` and `data1` stay
# available under their original names.
month <- 1
url <- paste0(endpoint, year_start, "/", month, ".json?api-key=", nyt_key)
response1 <- GET(url)
abstracts2020 <- NULL
if (response1$status_code == 200) {
  data1 <- fromJSON(httr::content(response1, "text", encoding = "UTF-8"))
  abstracts2020 <- data1$response$docs$abstract
} else {
  # Same best-effort handling as the remaining months: report and carry on.
  cat("\nError status code in month ", month)
}

# Months 2-9: collect each month's abstracts in a preallocated list instead of
# growing the result vector on every iteration.
remaining_months <- 2:9
monthly_abstracts <- vector("list", length(remaining_months))
for (i in seq_along(remaining_months)) {
  month <- remaining_months[i]
  url <- paste0(endpoint, year_start, "/", month, ".json?api-key=", nyt_key)
  response <- GET(url)
  if (response$status_code == 200) {
    data <- fromJSON(httr::content(response, "text", encoding = "UTF-8"))
    # `[i] <- list(...)` rather than `[[i]] <-`: assigning NULL through `[[`
    # would delete the slot and shift the later months.
    monthly_abstracts[i] <- list(data$response$docs$abstract)
  } else {
    cat("\nError status code in month ", month)
  }
  # Pause between requests.
  Sys.sleep(5)
}

# Append months 2-9 to the month-1 abstracts, in month order.
abstracts2020 <- c(abstracts2020, unlist(monthly_abstracts, use.names = FALSE))
# ---- Build and clean the 2020 abstracts corpus ----
# Load the saved abstracts; this replaces any `abstracts2020` already in the
# workspace. NOTE(review): assumes nyt_abstracts.rda provides an object named
# `abstracts` with an `abstracts2020` element -- confirm against the file.
load("nyt_abstracts.rda")
abstracts2020 <- abstracts$abstracts2020

# One corpus document per abstract.
corpus2020 <- Corpus(VectorSource(abstracts2020))

# Lower-case first so the (lower-case) stop-word list below can match.
corpus2020 <- tm_map(corpus2020, content_transformer(tolower))

# `ucp = TRUE` selects digits/punctuation by Unicode character properties, so
# non-ASCII characters such as curly quotes are removed too. The argument was
# previously misspelled `upc`, which was silently swallowed by `...` and left
# the ASCII-only default in effect.
corpus2020 <- tm_map(corpus2020, removeNumbers, ucp = TRUE)
corpus2020 <- tm_map(corpus2020, removePunctuation, ucp = TRUE)

# Drop English stop words plus terms specific to this source.
# NOTE(review): this runs after punctuation removal, so the punctuation-only
# entries below probably never match -- list kept unchanged.
corpus2020 <- tm_map(
  corpus2020,
  removeWords,
  c(
    stopwords("english"),
    "new", "york", "times", "'s", "'re", "-", "_", "' s"
  )
)

# Collapse the whitespace runs left behind by the removals, then stem.
corpus2020 <- tm_map(corpus2020, stripWhitespace)
corpus2020 <- tm_map(corpus2020, stemDocument)