Find the most common value in the column of a CSV file (without pandas)-CodePudding

I am trying to find the most common value located within a column in a CSV file, and return that value.

I am only allowed to import the file that I'm using.

I cannot use pandas.

Here is what I tried to do:

def get_longest_value_from_col(filename, column_name):
    """Attempt to return the most common value in a column of a CSV file.

    NOTE(review): this is the non-working attempt from the question; the
    inline comments mark the places where it goes wrong.
    """
    with open(filename, 'r') as csvfile:
        # `csv` is not imported anywhere in this snippet, and `reader` is
        # never read again after this line.
        reader = csv.DictReader(csvfile)
        counts = {}  # created but never used below
        # `filename` is the path argument (a string in the call below), yet
        # it is called here as if it were a function.
        num = filename(column_name)
        # Loops over `filename` itself rather than over the rows of `reader`.
        for i in filename:
            curr_frequency = filename.count(i)
            # `counter` is compared here before it has ever been assigned.
            if curr_frequency > counter:
                counter = curr_frequency
                num = i
        return num

# Look up the most common location in the sample file and show it as text.
most_common_value = get_longest_value_from_col(
    filename='personal_data.csv',
    column_name='the_location',
)
print(str(most_common_value))

CodePudding user response：

As a simple solution, you can update your function, as below:

import csv

def get_longest_value_from_col(filename, column_name):
    """Return the most common value in one column of a CSV file.

    Args:
        filename: Path of the CSV file; its first row is used as the header.
        column_name: Header name of the column to inspect.

    Returns:
        The value that occurs most often in the column. When several values
        share the highest count, the one that appears first in the file is
        returned. ``None`` is returned when the file has no data rows.

    Raises:
        KeyError: If ``column_name`` is not one of the header names.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to a reader.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # Count in a single pass. A generator has no .count() method, and
        # calling list.count() once per row would be quadratic.
        counts = {}
        for row in reader:
            value = row[column_name]
            counts[value] = counts.get(value, 0) + 1
    if not counts:
        return None
    # dicts keep insertion order and max() returns the first maximal key,
    # so ties resolve to the value seen first in the file.
    return max(counts, key=counts.get)

# Show the most common location found in the sample file.
most_common_value = get_longest_value_from_col('personal_data.csv', 'the_location')
print(most_common_value)

Or, following uozcan12's suggestion, we can use collections.Counter, which accepts a generator directly, so the column values never have to be stored in a list first (Python docs):

import csv  
from collections import Counter

def get_longest_value_from_col(filename, column_name):
    """Return the most common value in one column of a CSV file.

    Args:
        filename: Path of the CSV file; its first row is used as the header.
        column_name: Header name of the column to inspect.

    Returns:
        The value that occurs most often in the column, or ``None`` when the
        file has no data rows.

    Raises:
        KeyError: If ``column_name`` is not one of the header names.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to a reader.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        column_values = (row[column_name] for row in reader)
        # The generator is lazy, so it must be consumed by Counter while the
        # file is still open.
        top = Counter(column_values).most_common(1)
    # most_common(1) is an empty list when nothing was counted; indexing it
    # directly would raise IndexError.
    return top[0][0] if top else None

# Show the most common location found in the sample file.
most_common_value = get_longest_value_from_col('personal_data.csv', 'the_location')
print(most_common_value)

CodePudding user response：

If you don't want to import anything then:

# Header name of the column whose most frequent value is looked up.
COLUMN = 'the_location'
# Delimiter on which each line of the file is split into fields.
SEP = ','

def get_columns(line):
    """Split one line on SEP and strip surrounding whitespace from each field."""
    return [field.strip() for field in line.split(SEP)]

def get_longest_value_from_col(filename):
    """Return the most common value in the ``COLUMN`` column of a CSV file.

    The file is parsed by hand, without imports: the first line is the
    header and every later line is split into fields by ``get_columns``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The value that occurs most often in the column; ties go to the value
        seen first. ``None`` is returned when the file is empty, when it has
        no data rows, or (after printing a message) when the header does not
        contain ``COLUMN``.
    """
    with open(filename) as csv_file:
        header_line = next(csv_file, None)
        if header_line is None:  # completely empty file: no header at all
            return None
        columns = get_columns(header_line)  # header columns
        # Only the header lookup is guarded, so a ValueError raised anywhere
        # else can never be misreported as a missing column.
        try:
            column_index = columns.index(COLUMN)
        except ValueError:
            print(f"Couldn't find column {COLUMN}")
            return None
        data = {}
        for line in csv_file:
            if not line.strip():
                continue  # skip blank lines, e.g. a trailing empty line
            cdata = get_columns(line)[column_index]
            data[cdata] = data.get(cdata, 0) + 1
    if not data:  # header only, no rows to count
        return None
    return max(data.items(), key=lambda x: x[1])[0]
        

# Show the most common value of the configured column in the sample file.
most_common_value = get_longest_value_from_col('personal_data.csv')
print(most_common_value)