I have this loop in R that is scraping Reddit comments from an API incrementally on a minute basis (e.g. all comments containing a certain keyword between now and 1 minute ago, 1 minute ago and 2 minutes ago, 2 minutes ago and 3 minutes ago, etc.):
library(jsonlite)

# Pieces of the Pushshift comment-search URL. The assembled URL is
#   <part1><older bound>m&before=<newer bound>m&size=500
# where both bounds are expressed as "minutes ago".
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

# Number of one-minute windows to request.
n_minutes <- 526000

# Preallocate one slot per window; slots for failed requests stay NULL and
# are ignored when the pieces are row-bound at the end.
results <- vector("list", n_minutes)

# Running row count, so progress reporting does not re-scan `results`
# on every iteration.
total_rows <- 0L

for (i in seq_len(n_minutes)) {
  tryCatch(
    {
      # Window i: comments posted between (i + 1) and i minutes ago.
      url_i <- paste0(part1, i + 1, part2, i, part3)
      r_i <- data.frame(fromJSON(url_i))
      results[[i]] <- r_i
      total_rows <- total_rows + NROW(r_i)
      print(c(i, total_rows))
    },
    error = function(e) {
      # Keep going on a failed request, but report it instead of hiding it.
      message("Window ", i, " skipped: ", conditionMessage(e))
    }
  )
}

final <- do.call(rbind.data.frame, results)
saveRDS(final, "final.RDS")
I would like to split this loop up so that it runs in segments of 20000 iterations, saves each segment, and then continues to the next segment.
This is the code I am using:
# Scrape the one-minute windows in segments of 20000 and save every segment
# to its own file ("file_1.RDS", "file_2.RDS", ...).
# Expects jsonlite to be loaded and part1, part2 and part3 to be defined
# already, exactly as the per-minute loop requires.
segment_size <- 20000
n_minutes <- 526000

# First window of every segment: 1, 20001, 40001, ..., 520001.
index <- seq(1, n_minutes, by = segment_size)
# Last window of every segment; the final segment is shorter and ends at
# n_minutes.
segment_end <- c(index[-1] - 1, n_minutes)
###########################################
for (i in seq_along(index)) {
  start_i <- index[i]
  end_i <- segment_end[i]

  # Fresh storage per segment so each file holds only its own windows.
  results <- vector("list", end_i - start_i + 1)
  total_rows <- 0L

  for (j in start_i:end_i) {
    tryCatch(
      {
        # Window j: comments posted between (j + 1) and j minutes ago.
        url_j <- paste0(part1, j + 1, part2, j, part3)
        resource_j <- data.frame(fromJSON(url_j))
        results[[j - start_i + 1]] <- resource_j
        total_rows <- total_rows + NROW(resource_j)
        print(c(j, total_rows))
      },
      error = function(e) {
        # Keep going on a failed request, but report it instead of hiding it.
        message("Window ", j, " skipped: ", conditionMessage(e))
      }
    )
  }

  # Save once per segment, after all of its windows have been requested.
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}
I am not sure if I have done this correctly - can someone please show me how to do this correctly?
Thank you!
CodePudding user response:
If the logic is to loop over each block: build the sequence (`index`), use `index` itself as `start`, and build `end` by removing the first element of `index`, subtracting 1 from each remaining value, and appending 526000 (because the last element of `index` is 520001). Then loop over the sequence of `start`, extract the corresponding `start_i` and `end_i` for that position, loop over `start_i:end_i`, and apply the code while appending the output to the `results` list:
library(jsonlite)

# Segment boundaries: `start` holds the first one-minute window of each
# segment and `end` the last one. The final segment is shorter, so 526000 is
# appended explicitly (the last element of `index` is 520001).
index <- seq(1, 526000, by = 20000)
start <- index
end <- c(index[-1] - 1, 526000)

part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

for (i in seq_along(start)) {
  start_i <- start[i]
  end_i <- end[i]

  # Per-segment storage, reset for every segment so that a file never
  # repeats rows already written by an earlier segment.
  results <- vector("list", end_i - start_i + 1)
  total_rows <- 0L

  for (j in start_i:end_i) {
    tryCatch(
      {
        # Window j: comments posted between (j + 1) and j minutes ago.
        url_j <- paste0(part1, j + 1, part2, j, part3)
        resource_j <- data.frame(fromJSON(url_j))
        results[[j - start_i + 1]] <- resource_j
        total_rows <- total_rows + NROW(resource_j)
        print(c(j, total_rows))
      },
      error = function(e) {
        # Keep going on a failed request, but report it instead of hiding it.
        message("Window ", j, " skipped: ", conditionMessage(e))
      }
    )
  }

  # Combine and write once per segment (not once per minute), naming the
  # file after the segment number.
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}
CodePudding user response:
Based on the comment provided by @akrun:
library(jsonlite)

part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

# Segment boundaries in one-minute windows. `start` is the first window of
# each segment and `end` the last; the final, shorter segment ends at 526000.
index <- seq(1, 526000, by = 20000)
start <- index
end <- c(index[-1] - 1, 526000)
###########################################

# Request the comments posted between (j + 1) and j minutes ago.
# Returns a data frame, or NULL (after reporting the error) when the request
# or the conversion fails.
fetch_window <- function(j) {
  tryCatch(
    data.frame(fromJSON(paste0(part1, j + 1, part2, j, part3))),
    error = function(e) {
      message("Window ", j, " skipped: ", conditionMessage(e))
      NULL
    }
  )
}

for (i in seq_along(start)) {
  start_i <- start[i]
  end_i <- end[i]

  # Fresh storage per segment so each file holds only its own windows.
  results <- vector("list", end_i - start_i + 1)
  total_rows <- 0L

  for (j in start_i:end_i) {
    resource_j <- fetch_window(j)
    if (!is.null(resource_j)) {
      results[[j - start_i + 1]] <- resource_j
      total_rows <- total_rows + NROW(resource_j)
      print(c(j, total_rows))
    }
  }

  # Write the segment once all of its windows have been requested.
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}
@akrun: Is this what you meant? Thank you!