Splitting a Loop into Subloops-CodePudding

I have this loop in R that is scraping Reddit comments from an API incrementally on a minute basis (e.g. all comments containing a certain keyword between now and 1 minute ago, 1 minute ago and 2 minutes ago, 2 minutes ago and 3 minutes ago, etc.):

library(jsonlite)

# Fixed pieces of the Pushshift query URL. The minute offsets are spliced in
# between them to form: ...&after=<i + 1>m&before=<i>m&size=500
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

# One request per one-minute window, walking backwards from "now".
n_minutes <- 526000

# Preallocated so the list is not regrown on every iteration. Slots for
# failed requests stay NULL and are ignored by rbind.data.frame() below.
results <- vector("list", n_minutes)

# Running count of rows fetched so far (replaces re-scanning `results`
# with sapply() on every iteration, which was quadratic in total).
total_rows <- 0

for (i in seq_len(n_minutes)) {
  tryCatch({
    # Window i covers comments between (i + 1) minutes ago and i minutes ago.
    url_i <- paste0(part1, i + 1, part2, i, part3)
    r_i <- data.frame(fromJSON(url_i))
    results[[i]] <- r_i

    total_rows <- total_rows + NROW(r_i)
    print(c(i, total_rows))
  }, error = function(e) {
    # Keep going on failure (best-effort scrape), but report which window
    # was lost instead of discarding the error silently.
    message("Request for minute ", i, " failed: ", conditionMessage(e))
  })
}

# Stack every per-minute data frame into one and persist it.
final <- do.call(rbind.data.frame, results)
saveRDS(final, "final.RDS")

I would like to split this loop up so that it runs in segments of 20000 iterations, saves each segment, and then continues to the next segment.

This is the code I am using:

# Window boundaries: every 20000th minute, i.e. 1, 20001, ..., 520001.
index <- seq(1, 526000, by = 20000)

# Number of consecutive (index[i], index[i + 1]) windows.
# NOTE(review): the last boundary is 520001, so minutes past it are never
# requested by this loop -- confirm whether the tail should be included.
max <- as.numeric(length(index) - 1)

###########################################

# results[[i]] holds the response for window i; failed windows are skipped.
results <- list()

for (i in seq_len(max)) {
  tryCatch({
    start_i <- index[i]
    end_i <- index[i + 1]

    # NOTE(review): this sends ONE request spanning the whole 20000-minute
    # window rather than one request per minute, so it is not equivalent to
    # the per-minute loop -- verify this is intended.
    url_i <- paste0(part1, end_i, part2, start_i, part3)
    resource_i <- data.frame(fromJSON(url_i))
    results[[i]] <- resource_i

    # Progress: window number and total rows collected so far.
    myvec_i <- vapply(results, NROW, integer(1))
    print(c(i, sum(myvec_i)))
  }, error = function(e) {
    # Best-effort: report the failed window and carry on.
    message("Request for window ", i, " failed: ", conditionMessage(e))
  })

  # Runs after every window, whether or not the request succeeded.
  # `results` is cumulative, so file_<i>.RDS contains every window fetched
  # up to and including window i.
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}

I am not sure if I have done this correctly - can someone please show me how to do this correctly?

Thank you!

CodePudding user response：

If the goal is to process the minutes block by block, first build the block boundaries from the sequence `index`: `start` is `index` itself, and `end` is `index` with its first element removed and 1 subtracted from each value, with 526000 appended (because the last element of `index` is 520001). Then loop over the positions of `start`, extract the matching pair (`start_i`, `end_i`) for each block, loop over every minute in `start_i:end_i`, and run the request code for that minute, appending the output to the `results` list.

 # Segment boundaries. `start[i]` is the first minute of segment i and
 # `end[i]` the last; 526000 is appended because the final element of
 # `index` is 520001, so the last (shorter) segment is 520001..526000.
 index <- seq(1, 526000, by = 20000)
 start <- index
 end <- c(index[-1] - 1, 526000)

 # Fixed pieces of the query URL; the minute offsets go between them.
 part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
 part2 <- "m&before="
 part3 <- "m&size=500"

 # Running count of rows fetched across all segments, for progress output.
 total_rows <- 0

 for (i in seq_along(start)) {
   start_i <- start[i]
   end_i <- end[i]

   # Fresh list for each segment, so every saved file holds only its own
   # segment and memory use does not grow across the whole run.
   results <- vector("list", end_i - start_i + 1)

   for (j in start_i:end_i) {
     tryCatch({
       # Minute j covers comments between (j + 1) and j minutes ago.
       url_j <- paste0(part1, j + 1, part2, j, part3)
       resource_j <- data.frame(fromJSON(url_j))

       # Index relative to the segment start so the list stays compact.
       results[[j - start_i + 1]] <- resource_j

       total_rows <- total_rows + NROW(resource_j)
       print(c(j, total_rows))
     }, error = function(e) {
       # Best-effort: report the failed minute and move on.
       message("Request for minute ", j, " failed: ", conditionMessage(e))
     })
   }

   # Save once per segment, after its inner loop has finished. Minutes that
   # failed are NULL entries, which rbind.data.frame() ignores.
   final_i <- do.call(rbind.data.frame, results)
   title_i <- paste0("file_", i, ".RDS")
   saveRDS(final_i, title_i)
 }

CodePudding user response：

Based on the comment provided by @akrun:

library(jsonlite)

# Fixed pieces of the query URL; the minute offsets go between them.
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

# Segment boundaries: 1, 20001, ..., 520001.
index <- seq(1, 526000, by = 20000)

# Number of complete segments (not used by the loop below; kept as-is).
max <- as.numeric(length(index) - 1)

# First and last minute of each complete 20000-minute segment.
# NOTE(review): dropping the last element of `index` means minutes
# 520001..526000 are never requested -- confirm whether that is intended.
start <- index[-length(index)]
end <- index[-1] - 1

###########################################

# results[[i]] holds the response for segment i; failed segments are skipped.
results <- list()

for (i in seq_along(start)) {
  tryCatch({
    start_i <- start[i]
    end_i <- end[i]

    # NOTE(review): this sends ONE request spanning the whole segment rather
    # than one request per minute inside it -- verify this is intended.
    url_i <- paste0(part1, end_i, part2, start_i, part3)
    resource_i <- data.frame(fromJSON(url_i))
    results[[i]] <- resource_i

    # Progress: segment number and total rows collected so far.
    myvec_i <- vapply(results, NROW, integer(1))
    print(c(i, sum(myvec_i)))
  }, error = function(e) {
    # Best-effort: report the failed segment and carry on.
    message("Request for segment ", i, " failed: ", conditionMessage(e))
  })

  # Runs after every segment, whether or not the request succeeded.
  # `results` is cumulative, so file_<i>.RDS contains every segment fetched
  # up to and including segment i.
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}

@akrun: Is this what you meant? Thank you!