Home > Blockchain >  Splitting a Loop into Subloops
Splitting a Loop into Subloops

Time:09-27

I have this loop in R that is scraping Reddit comments from an API incrementally on a minute basis (e.g. all comments containing a certain keyword between now and 1 minute ago, 1 minute ago and 2 minutes ago, 2 minutes ago and 3 minutes ago, etc.):

library(jsonlite)

# Scrape Reddit comments matching a keyword from the Pushshift API, one
# request per one-minute window, walking backwards in time from "now".
# The three URL fragments are glued around the window bounds (in minutes).
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

n_minutes <- 526000L

# Preallocate so the list is not regrown on every iteration; slots for
# failed requests stay NULL and are ignored by rbind at the end.
results <- vector("list", n_minutes)
total_rows <- 0

for (i in seq_len(n_minutes)) {
  tryCatch(
    {
      # Window i spans from (i + 1) minutes ago up to i minutes ago.
      # Use integer arithmetic (1L): a double such as 100000 would be
      # pasted as "1e+05" and produce a malformed URL.
      url_i <- paste0(part1, i + 1L, part2, i, part3)
      r_i <- data.frame(fromJSON(url_i))
      results[[i]] <- r_i

      # Running row count (replaces re-scanning the whole list each time).
      total_rows <- total_rows + NROW(r_i)
      print(c(i, total_rows))
    },
    error = function(e) {
      # Keep going on a failed request, but report it instead of hiding it.
      message("Request for minute ", i, " failed: ", conditionMessage(e))
    }
  )
}

final <- do.call(rbind.data.frame, results)
saveRDS(final, "final.RDS")

I would like to split this loop up so that it runs in segments of 20000 iterations, saves each segment, and then continues to the next segment.

This is the code I am using:

# Run the one-minute scrape in segments of 20000 iterations: each segment
# collects its own results, is saved to its own file, and then the loop
# moves on to the next segment.
# Relies on part1, part2, part3 and jsonlite::fromJSON defined/loaded above.
segment_size <- 20000
n_minutes <- 526000

# First minute of every segment: 1, 20001, ..., 520001.
index <- seq(1, n_minutes, by = segment_size)
# Last minute of every segment; the final segment is shorter and is closed
# explicitly at n_minutes so minutes 520001..526000 are not dropped.
segment_end <- c(index[-1] - 1, n_minutes)

###########################################

for (i in seq_along(index)) {
  # `:` yields integers here, so the minute values paste as plain digits.
  minutes_i <- index[i]:segment_end[i]

  # Fresh container per segment so each file holds only that segment's rows.
  results <- vector("list", length(minutes_i))
  total_rows <- 0

  for (k in seq_along(minutes_i)) {
    j <- minutes_i[k]
    tryCatch(
      {
        # Window j spans from (j + 1) minutes ago up to j minutes ago.
        # 1L keeps the sum an integer; a double like 100000 would be
        # pasted as "1e+05" and break the URL.
        url_j <- paste0(part1, j + 1L, part2, j, part3)
        resource_j <- data.frame(fromJSON(url_j))
        results[[k]] <- resource_j

        total_rows <- total_rows + NROW(resource_j)
        print(c(j, total_rows))
      },
      error = function(e) {
        # Skip a failed minute, but say so rather than swallowing the error.
        message("Request for minute ", j, " failed: ", conditionMessage(e))
      }
    )
  }

  # One file per segment: file_1.RDS, file_2.RDS, ...
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}

I am not sure if I have done this correctly - can someone please show me how to do this correctly?

Thank you!

CodePudding user response:

If the goal is to loop over each block, first build the sequence of block starts ('index'). Use 'index' itself as 'start', and build 'end' by removing the first element of 'index', subtracting 1 from each remaining element, and appending 526000 (because the last element of 'index' is 520001). Then loop over the sequence of 'start', extract the corresponding 'start' and 'end' values for that position (start_i, end_i), loop over the sequence start_i:end_i, and apply the code while appending each output to the results list.

 # Scrape in blocks of 20000 one-minute windows and save one file per block.
 # Requires jsonlite to be loaded for fromJSON().
 index <- seq(1, 526000, by = 20000)
 start <- index
 # Each block ends one minute before the next block starts; the last block
 # (starting at 520001) ends at 526000.
 end <- c(index[-1] - 1, 526000)

 part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
 part2 <- "m&before="
 part3 <- "m&size=500"

 for (i in seq_along(start)) {
   start_i <- start[i]
   end_i <- end[i]

   # Reset per block so every saved file contains only that block's rows.
   results <- vector("list", end_i - start_i + 1)
   total_rows <- 0

   # start_i:end_i yields integers, so j pastes as plain digits.
   for (j in start_i:end_i) {
     tryCatch(
       {
         # Window j spans from (j + 1) minutes ago up to j minutes ago.
         # 1L keeps the sum an integer; a double like 100000 would be
         # pasted as "1e+05" and break the URL.
         url_j <- paste0(part1, j + 1L, part2, j, part3)
         resource_j <- data.frame(fromJSON(url_j))

         # Position within the current block, not the global minute index.
         results[[j - start_i + 1]] <- resource_j

         total_rows <- total_rows + NROW(resource_j)
         print(c(j, total_rows))
       },
       error = function(e) {
         # Skip a failed minute, but report it instead of hiding it.
         message("Request for minute ", j, " failed: ", conditionMessage(e))
       }
     )
   }

   # Save once per block (after the inner loop), named by block number.
   final_i <- do.call(rbind.data.frame, results)
   title_i <- paste0("file_", i, ".RDS")
   saveRDS(final_i, title_i)
 }

CodePudding user response:

Based on the comment provided by @akrun:

library(jsonlite)

# Scrape Reddit comments in one-minute windows, processed in segments of
# 20000 minutes; each segment is bound together and saved to its own file.
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "m&before="
part3 <- "m&size=500"

index <- seq(1, 526000, by = 20000)

# Segment bounds: every element of `index` starts a segment; each segment
# ends one minute before the next start, and the last one ends at 526000.
start <- index
end <- c(index[-1] - 1, 526000)

###########################################

for (i in seq_along(start)) {
  start_i <- start[i]
  end_i <- end[i]

  # Fresh list per segment so saved files do not accumulate earlier rows.
  results <- vector("list", end_i - start_i + 1)
  total_rows <- 0

  # start_i:end_i yields integers, so j pastes as plain digits.
  for (j in start_i:end_i) {
    tryCatch(
      {
        # Window j spans from (j + 1) minutes ago up to j minutes ago.
        # 1L keeps the sum an integer; a double like 100000 would be
        # pasted as "1e+05" and break the URL.
        url_j <- paste0(part1, j + 1L, part2, j, part3)
        resource_j <- data.frame(fromJSON(url_j))
        results[[j - start_i + 1]] <- resource_j

        total_rows <- total_rows + NROW(resource_j)
        print(c(j, total_rows))
      },
      error = function(e) {
        # Continue past a failed minute, but report it.
        message("Request for minute ", j, " failed: ", conditionMessage(e))
      }
    )
  }

  # One file per segment: file_1.RDS, file_2.RDS, ...
  final_i <- do.call(rbind.data.frame, results)
  title_i <- paste0("file_", i, ".RDS")
  saveRDS(final_i, title_i)
}
    
    

@akrun: Is this what you meant? Thank you!

  • Related