Home > Enterprise >  valid and corrupt lines from file
valid and corrupt lines from file

Time:10-28

import os
import sys

valid_lines = []
corrupt_lines = []

'''
The validate_data function will check the students.csv line by line for corrupt data.

- Valid lines should be added to the valid_lines list.
- Invalid lines should be added to the corrupt_lines list.

Example input: 0896801,Kari,Wilmore,1970-06-18,INF
This data is valid and the line should be added to the valid_lines list unchanged.

Example input: 0773226,Junette,Gur_ry,1995-12-05,
This data is invalid and the line should be added to the corrupt_lines list in the following format:

0773226,Junette,Gur_ry,1995-12-05, => INVALID DATA: ['0773226', 'Gur_ry', '']

In the above example the studentnumber does not start with '08' or '09', 
the last name contains a special character and the student program is empty.

Don't forget to put the students.csv file in the same location as this file!
'''


def validate_data(line):
    # TYPE YOUR SOLUTION CODE HERE
    #CSV STUDENT NUMBER
    student_number_valid = True
    index = 0
    studentnumber, firstname, lastname, birthdate, studyprogram = line.split(",")
    for element in line.split(","):
        if index == 0:
            student_number = element

    if len(studentnumber) > 0:
        csv_s_n = studentnumber[0]
        csv_s_n2 = studentnumber[1]
        if csv_s_n == '0' and csv_s_n2 == '8' or '9':
            valid_lines.append(studentnumber)
        else:
            corrupt_lines.append(studentnumber)
    else:
        corrupt_lines.append(studentnumber)

    #CSV NAME
    if len(firstname) > 0:
        if firstname.isalpha() == True:
            valid_lines.append(firstname)
        else:
            corrupt_lines.append(firstname)
    else:
        corrupt_lines.append(firstname)

    if len(lastname) > 0:
        if lastname.isalpha() == True:
            valid_lines.append(lastname)
        else:
            corrupt_lines.append(lastname)
    else:
        corrupt_lines.append(lastname)

    #CSV BIRTHDAY
    if len(birthdate.split()) == 3:
        year1, month1, day1 = birthdate.split("-")
        year1=int(year1)
        valid_months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
        valid_days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10","11", "12", "13", 
        "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]
        thirty_day_months = ["04", "06", "09", "11"]
        if not year1 in range(1960, 2004   1):
            birthdate = (f"{year1}-{month1}-{day1}")
            corrupt_lines.append(birthdate)
        if not month1 in valid_months:
            birthdate = (f"{year1}-{month1}-{day1}")
            corrupt_lines.append(birthdate)
        if month1 == "02":
            valid_days.remove("31")
            valid_days.remove("30")
            valid_days.remove("29")
            if not day1 in valid_days:
                birthdate = (f"{year1}-{month1}-{day1}")
                corrupt_lines.append(birthdate)
                # validate day for february
        if month1 in thirty_day_months:
            valid_days.remove("31")
            if not day1 in valid_days:
                birthdate = (f"{year1}-{month1}-{day1}")
                corrupt_lines.append(birthdate)
                # validate day for 30-day months
        else:
            birthdate = (f"{year1}-{month1}-{day1}")
            valid_lines.append(birthdate)
    else:
        corrupt_lines.append(birthdate)
            
    #CSV STUDYPROGRAM
    if len(studyprogram) > 1:
        if studyprogram == "INF" or "TINF" or "CMD" or "AI":
            valid_lines.append(studyprogram)
        else:
            corrupt_lines.append(studyprogram)
    else:
        corrupt_lines.append(studyprogram)


def main(csv_file):
    with open(os.path.join(sys.path[0], csv_file), newline='') as csv_file:
        # skip header line
        next(csv_file)

        for line in csv_file:
            validate_data(line.strip())

        print('### VALID LINES ###')
        print("\n".join(valid_lines))
        print('### CORRUPT LINES ###')
        print("\n".join(corrupt_lines))


if __name__ == "__main__":
    main('students.csv')

As you can read, the function validate_data should check the imported file for corrupt and valid lines, then append them to the correct list, and print them. It works, except that, as you can probably see, the lines will not print in a single line.

I'm sure I have to make two other lists to append the correct data into a single line, doing the same with the corrupt data, but when I try it fails.

CodePudding user response:

You're adding the line to each list depending on each validation. So you'll add it to each list multiple times, depending on which validations succeed and fail.

You should only add it to one list or the other -- if it fails any validation, add it to corrupt_list, and only add to valid_list if all validations succeed.

The simple way to do this is to add to corrupt_list when each validation fails, and then return from the function. If you make it to the end of all the validations, add to valid_list.

You shouldn't be appending individual fields to valid_lines and corrupt_lines, they're supposed to contain the entire line.

csv_s_n2 == '8' or '9' is not the correct what to test if a variable is equvale to either value. See Why does "a == x or y or z" always evaluate to True? How can I compare "a" to all of those?

def validate_data(line):
    # TYPE YOUR SOLUTION CODE HERE
    #CSV STUDENT NUMBER
    student_number_valid = True
    index = 0
    studentnumber, firstname, lastname, birthdate, studyprogram = line.split(",")

    if len(studentnumber) > 0:
        csv_s = studentnumber[0:2]
        if csv_s not in ('08', '09'))
            corrupt_lines.append(line)
            return
    else:
        corrupt_lines.append(line)
        return

    #CSV NAME
    if len(firstname) > 0:
        if not firstname.isalpha():
            corrupt_lines.append(line)
            return
    else:
        corrupt_lines.append(line)
        return

    if len(lastname) > 0:
        if not lastname.isalpha():
            corrupt_lines.append(line)
            return
    else:
        corrupt_lines.append(line)
        return

    #CSV BIRTHDAY
    if len(birthdate.split('-')) == 3:
        year1, month1, day1 = birthdate.split("-")
        year1=int(year1)
        valid_months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
        valid_days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10","11", "12", "13", 
                      "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]
        thirty_day_months = ["04", "06", "09", "11"]
        if not year1 in range(1960, 2004   1):
            corrupt_lines.append(line)
            return
        if not month1 in valid_months:
            corrupt_lines.append(line)
            return
        if month1 == "02":
            # validate day for february
            valid_days.remove("31")
            valid_days.remove("30")
            valid_days.remove("29")
        elif month1 in thirty_day_months:
            # validate day for 30-day months
            valid_days.remove("31")
        if not day1 in valid_days:
            corrupt_lines.append(line)
            return
    else:
        corrupt_lines.append(line)
        return
            
    #CSV STUDYPROGRAM
    if studyprogram not in ("INF", "TINF", "CMD", "AI"):
        corrupt_lines.append(line)
        return

    valid_lines.append(line)
  • Related