Getting row counts from Redshift during unload process and counting rows loaded in S3


My Python code, shown below, unloads data from Redshift to an Amazon S3 bucket. I am trying to get row counts from both Redshift and the S3 bucket to ensure that all the data was loaded. Additionally, I would also like to get the date of the last upload to the S3 bucket so that I know when the last unload was performed. Kindly suggest the code with an explanation.

Thanks in advance for your time and efforts!

import csv
import redshift_connector
import sys

CSV_FILE = "Tables.csv"
CSV_DELIMITER = ';'
S3_DEST_PATH = "s3://..../"
DB_HOST = "MY HOST"
DB_PORT = 1234
DB_DB = "MYDB"
DB_USER = "MY_READ"
DB_PASSWORD = "MY_PSWD"
# UNLOAD accepts chained roles as a single comma-separated string
IAM_ROLE = "arn:aws:iam::/redshift-role/unload data,arn:aws:iam::/write in bucket"

def get_tables(path):
    tables = []
    with open(path, 'r') as file:
        csv_reader = csv.reader(file, delimiter=CSV_DELIMITER)
        next(csv_reader, None)  # skip the header row, if any
        for row in csv_reader:
            tables.append(row)
    return tables

def unload(conn, tables, s3_path):
    cur = conn.cursor()

    for table in tables:
        print(f">{table[0]}.{table[1]}")
        try:
            # UNLOAD needs the SELECT wrapped in parentheses: unload('...') to '...'
            query = f'''unload('select * from {table[0]}.{table[1]}')
            to '{s3_path}/{table[1]}/'
            iam_role '{IAM_ROLE}'
            CSV
            PARALLEL FALSE
            CLEANPATH;'''
            print("unloading in progress")
            cur.execute(query)
            print("Done.")
        except Exception as e:
            print("Failed to unload")
            print(str(e))
            sys.exit(1)

    cur.close()  # close the cursor once, after all tables are processed
        

def main():
    try:
        conn = redshift_connector.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_DB,
            user=DB_USER,
            password=DB_PASSWORD
        )

        tables = get_tables(CSV_FILE)
        unload(conn, tables, S3_DEST_PATH)
        conn.close()
    except Exception as e:
        print(e)
        sys.exit(1)

if __name__ == "__main__":
    main()
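For reference, get_tables expects a delimited file whose first line is a header row; a hypothetical Tables.csv could look like:

schema;table
schema1;tablename
schema2;table2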

Update: code based on an SO user's comment

tables = ['schema1.tablename', 'schema2.table2']

conn = redshift_connector.connect(
    host='my_host',
    port='my_port',
    database='my_db',
    user='user',
    password='password')

cur = conn.cursor()
cur.execute(f'select count(*) from {",".join(tables)}')
results = cur.fetchall()
print("The table {} contained".format(tables[0]), *results[0], "rows", "\n")  # Printing row counts along with table names

cur.close()
conn.close()

2nd Update:

tables = ['schema1.tablename', 'schema2.table2']

conn = redshift_connector.connect(
    host='my_host',
    port='my_port',
    database='my_db',
    user='user',
    password='password')

cur = conn.cursor()
for table in tables:
    cur.execute(f'select count(*) from {table};')
    results = cur.fetchone()
    for row in result:
        print("The table {} contained".format(tables[0]), result[0], "rows", "\n")  # Printing row counts along with table names

CodePudding user response:

The simple query to get the number of rows is

query = f"select count(*) from {table_name}"

For Redshift, all you need to do is

cur.execute(query)
row_count = cur.fetchone()[0]  # count(*) returns a single row with a single value
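If you want the number of rows the UNLOAD itself wrote (instead of re-counting the table), Redshift logs that too. A minimal sketch, assuming it runs in the same session immediately after the UNLOAD statement:

cur.execute("""
    select sum(line_count)
    from stl_unload_log
    where query = pg_last_query_id();
""")
unloaded_rows = cur.fetchone()[0]  # total rows written across all unloaded files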

Using boto3, you can run a similar SQL query against the unloaded objects with S3 Select to fetch the S3 row count as well.
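A minimal sketch of that idea, assuming the unload produced header-less CSV files; bucket and key here are hypothetical placeholders:

import boto3

s3 = boto3.client('s3')

def s3_csv_row_count(bucket, key):
    # S3 Select evaluates the count server-side, so the file is not downloaded
    resp = s3.select_object_content(
        Bucket=bucket,
        Key=key,
        ExpressionType='SQL',
        Expression="select count(*) from s3object",
        InputSerialization={'CSV': {'FileHeaderInfo': 'NONE'}},
        OutputSerialization={'CSV': {}},
    )
    # The response payload is an event stream; the Records event holds the result
    for event in resp['Payload']:
        if 'Records' in event:
            return int(event['Records']['Payload'].decode().strip())
    return 0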

Edit:
Corrected your updated approach a little:

cur=conn.cursor()
for table in tables:
    cur.execute(f'select count(*) from {table};')
    result=cur.fetchone()
    count = result[0] if result else 0
    print(f"The table {table} contained {count} rows.\n" )