Combining all csv files from Github Repository Link and make it a one csv file-CodePudding

I want to collect all csv files from the following Github Repository link below and want to make it a new csv file (for data cleaning purpose):

CodePudding user response：

Here you go! You can specify the start and end dates to get all the data from those dates in between them. This also checks if the url for that particular date is present or not, and only if it is a valid url, does it add it to the final data frame.

import requests
import pandas as pd


def is_leap_year(year):
    # checks if the current year is leap year

    """
        params:
            year - int
        
        returns:
            bool
    """

    if((year%4==0 and year%100!=0) or (year%400==0)):
        return True
    else:
        return False


def split_date(date_str):
    # Splits the date into month, day and year

    """
        params:
            date_str - str (mm-dd-yyyy)
        
        returns:
            month - int
            day - int
            year - int
    """

    month, day, year = list(int(x) for x in date_str.split("-")) # For US standards, for rest of the world feel free to swap month and day
    return month, day, year


def generate_dates(start_date, end_date):
    # This doesn't validate the dates and it is assumed that the start_date and end_dates both are valid dates with the end date > start_date
    # This generates all dates bw start date and end date and also takes into account leap year as well

    """
        params:
            start_date - str (mm-dd-yyyy)
            end_date - str (mm-dd-yyyy)
        
        returns:
            dates - list of strings of dates between start_date and end_date
    """

    dates = []
    start_month, start_day, start_year = split_date(start_date)
    end_month, end_day, end_year = split_date(end_date)
    
    year = start_year
    while(year<=end_year):
        month = start_month if(year==start_year) else 1
        max_month = end_month if(year==end_year) else 12
        while(month<=max_month):
            day = start_day if(year==start_year) else 1
            if(month==2):
                max_day = 29 if(is_leap_year(year)) else 28
            else:
                max_day = 31 if(start_month in [1,3,5,7,8,10,12]) else 30
            if(year==end_year and month==end_month):
                max_day = end_day
            while(day<=max_day):
                new_date = f"{month}-{day}-{year}"
                dates.append(new_date)
                day =1
            month =1
        year =1

        return dates


def check_if_url_is_valid(url):
    # This checks if the url is valid through the python requests library, by making a GET request. if the url is present and valid then it returns status code in (200-300)

    """
        params:
            url - str
        
        returns:
            bool
    """

    r = requests.get(url)
    if(r.status_code in range(200,300)):
        return True
    else:
        return False
        

def to_df(base_url, start_date, end_date):
    # Takes all the generated dates, creates a url for each date through the base url and then tries to download it, else prints out an error message

    """
        params:
            base_url - str it should be of the format "https://github.com/{}.csv" where the {} will be used for string formatting and different dates will be put into it

            returns:
                final_df - pd.DataFrame 
    """

    files = []
    dates = generate_dates(start_date, end_date)
    for date in dates:
        url = base_url.format(date)
        valid_url = check_if_url_is_valid(url)
        if(valid_url):
            df = pd.read_csv(url)
            files.append(df)
        else:
            print(f"Could not download {date} data as it may be unavailable")
    final_df = pd.concat(files)
    print(f"\n Downloaded {len(files)} files!\n")
    return final_df

UPDATE:

Here's the Google Colab link for the same - https://colab.research.google.com/drive/19ysmJ2wWaiEpzGae7XqOSPa-FfNZqza3?usp=sharing