Home > Software engineering >  How count duplicates in one file and output it beautiful Python
How to count duplicates in a file and output them nicely in Python

Time:07-14

I have data with 3 columns. I need to compare these fields and, if they are the same, count them. If they are not the same, I need to output the information. I need to get this:

run.py

from create_excel import create_excel, settings_merge, create_header
from functions import separate_first_raw, separate_and_check_all_fields

# Script: read a 3-column CSV file line by line, compare each line with the
# previous one and write the category of every detected change together with
# a running counter to an Excel worksheet.
input_file = "test.csv"
output_file = "test.xlsx"

workbook = create_excel(output_file)
worksheet = workbook.add_worksheet()
merge_format = settings_merge(workbook)

# Worksheet row index of the next row to write; row 0 holds the header.
# NOTE(review): data rows start at index 2, so row 1 stays empty - confirm
# that this gap is intended.
raw = 2
# Number of lines counted since the last row was written.
sum_category = 1
# String flag passed to the helpers: "true" until the first row is written.
first = "true"

create_header(worksheet)

# The context manager guarantees the input file is closed, even on errors.
with open(input_file, "r", encoding="utf-8") as my_file:
    current_line = my_file.readline()

    if current_line:
        # Category (third column) of the very first line; it is written on
        # the first loop iteration, while ``first`` is still "true".
        first_line_value = current_line.split(",")[2]

        for line in my_file:
            previous_line = current_line
            print(f'previous_line {previous_line}')
            current_line = line
            print(f'current {current_line}')

            category = separate_and_check_all_fields(previous_line, current_line, 2, first)

            for i in category:
                if i != 0:
                    # A change was reported (or this is the first iteration):
                    # write the category and the counter, then start a new run.
                    print(i)
                    if first == "true":
                        worksheet.write(raw, 3, first_line_value, merge_format)
                    else:
                        worksheet.write(raw, 3, i[2], merge_format)
                    worksheet.write(raw, 2, sum_category, merge_format)
                    sum_category = 1
                    raw += 1
                    first = "false"
                else:
                    # Same id, domain and category as the previous line.
                    sum_category += 1
    else:
        # An empty file has no first line to split; report it instead of
        # failing with an IndexError.
        print("Please input file")


workbook.close()

functions.py

def separate_first_raw(previous_line, current_line, test):
    """Report whether the first comma-separated field changed between two lines.

    Both lines are split on ",". The ``test`` argument is accepted but not
    used by this function.

    Returns:
        A one-element list: the split fields of ``current_line`` when its
        first field differs from the first field of ``previous_line``,
        otherwise the integer ``0``.
    """
    previous_id = previous_line.split(",")[0]
    current_fields = current_line.split(",")

    return [0] if previous_id == current_fields[0] else [current_fields]


def separate_and_check_all_fields(previous_line, current_line, field, test):
    """Report whether two comma-separated lines differ in the checked fields.

    Args:
        previous_line: The earlier line, fields separated by ",".
        current_line: The later line, fields separated by ",".
        field: Index of the extra field to compare, in addition to fields
            1 and 0 which are always compared.
        test: String flag; when it equals "true" no comparison is made and
            the split fields of ``previous_line`` are returned.

    Returns:
        A one-element list. With ``test == "true"`` it holds the split
        fields of ``previous_line``. Otherwise it holds the split fields of
        ``current_line`` if any compared field differs, or the integer ``0``
        if all compared fields are equal.

    Raises:
        IndexError: If a compared field index does not exist in a line.
    """
    previous_list = previous_line.split(",")
    current_list = current_line.split(",")
    if test == "true":
        return [previous_list]

    # Compare on copies without the trailing line terminator: otherwise the
    # last column of a final line lacking "\n" (or using "\r\n") would never
    # equal the same value read from a line that ends with "\n".
    previous_key = previous_line.rstrip("\r\n").split(",")
    current_key = current_line.rstrip("\r\n").split(",")

    changed = any(
        previous_key[index] != current_key[index] for index in (field, 1, 0)
    )

    # The returned fields are the unmodified split of the current line.
    return [current_list] if changed else [0]

create_excel.py

import xlsxwriter


def create_excel(filename):
    """Create an ``xlsxwriter.Workbook`` for ``filename`` and return it."""
    return xlsxwriter.Workbook(filename)


def create_header(worksheet):
    """Write the four column titles into row 0 of ``worksheet``.

    The titles go into columns 0 to 3, one ``worksheet.write`` call each,
    from left to right.
    """
    titles = ("id", "domain", "firewall", "category")
    for column_index, title in enumerate(titles):
        worksheet.write(0, column_index, title)


def settings_merge(workbook):
    """Register the cell format used by the script on ``workbook``.

    The format properties are bold text, a border, and horizontal and
    vertical centering. The value returned by ``workbook.add_format`` is
    passed back to the caller.
    """
    properties = {}
    properties['bold'] = 1
    properties['border'] = 1
    properties['align'] = 'center'
    properties['valign'] = 'vcenter'

    return workbook.add_format(properties)

CodePudding user response:

Here is a simple way using enter image description here

  • Related