How to write a Python program to calculate the deciles, percentiles, and quantiles for a given data set - CodePudding

I am writing a python program to perform a set of steps as outlined below:

# Author: Evan Gertis
# Date  : 10/22
# program : quantile decile calculator
import csv
import logging 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: read csv
# The context manager closes the file handle once pandas has parsed it.
with open('test_scores.csv') as testScoresCSV:
    testScoresDF = pd.read_csv(testScoresCSV)
# testScoresDF        = pd.DataFrame(testScoresData)
# testScoresData      = map(int,testScoresData)
# testScoresList      = list(testScoresData)
print(testScoresDF.head())

# Step 3: use numpy to determine Q1, Q2, Q3
quantiles = np.quantile(testScoresDF, q=[0.25, 0.5, 0.75])
logging.debug(f"{quantiles}")

# Step 4: repeat step 3 with deciles
# NOTE(review): q=[0.1, 1, 0.1] is a list of three literal quantiles (10%, 100%, 10%),
# not a start/stop/step range, which is why the "Actual" output shows only three values.
# Left as posted because this is the behaviour the question asks about.
deciles = np.quantile(testScoresDF,  q=[0.1,1,0.1])
logging.debug(f"{deciles}")

# Step 5: repeat step 3 with percentiles
# NOTE(review): np.percentile takes q on a 0-100 scale, so these values request the
# 0.1th ... 1st percentiles rather than the 10th ... 100th. Left as posted for the same reason.
percentiles = np.percentile(testScoresDF, q=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
logging.debug(f"{percentiles}")

# Step 6: plot the results
N_points = len(testScoresDF)
logging.debug(f"N_points:{N_points}")
n_bins = 20

# Create a random number generator with a fixed seed for reproducibility
rng = np.random.default_rng(19680801)

# Generate two normal distributions
# NOTE(review): these histograms show synthetic random samples, not the test scores.
dist1 = rng.standard_normal(N_points)
# The "+" was lost when the page was scraped; without it this line is a syntax error.
dist2 = 0.4 * rng.standard_normal(N_points) + 5

fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)

# We can set the number of bins with the *bins* keyword argument.
axs[0].hist(dist1, bins=n_bins)
axs[1].hist(dist2, bins=n_bins)

# plt.show()

Expected:

This program should print the quartiles Q1-Q3, the deciles D1-D10, and the percentiles, and then show a plot of the score distribution.

Actual:

   Test_Scores
0           88
1           45
2           53
3           86
4           33
2022-10-22 14:30:48,381 - DEBUG - [37.75 57.   76.  ]
2022-10-22 14:30:48,381 - DEBUG - [30.9 99.  30.9]
2022-10-22 14:30:48,381 - DEBUG - [25.177 25.354 25.531 25.708 25.885 26.062 26.239 26.416 26.593 26.77 ]
2022-10-22 14:30:48,382 - DEBUG - N_points:60

Any help with this would be greatly appreciated. Thank you!

CodePudding user response：

I suppose that your CSV contains float values and that what you want to achieve is to convert every element to an integer. Here is a working example that does not need the CSV file.

import numpy as np
import pandas as pd

# Step 1: build the sample data in memory (no CSV file needed)
raw_scores = [2.2, 3.3, 83.2, 4, 2, 5, 1, 2, 3, 2]
testScoresData = pd.DataFrame(raw_scores, columns=["score"])
# int() truncates toward zero, so 2.2 becomes 2 and 83.2 becomes 83
testScores = [int(score) for score in testScoresData["score"]]
print(testScores)

# Step 3: use numpy to determine Q1, Q2, Q3
quartile_levels = [0.25, 0.5, 0.75]
quantiles = np.quantile(testScores, q=quartile_levels)
print(quantiles)

As mentioned in the comments, you don't need the csv package. With pandas you can just use read_csv

# pandas opens and parses the file directly from its path
testScoresData = pd.read_csv("yourdata.csv")

CodePudding user response：

Here is a solution to the problem verified with proper plotting:

# Author: Evan Gertis
# Date  : 10/22
# program : quantile decile calculator
import csv
import logging 
import coloredlogs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: read csv
# Passing the path lets pandas open and close the file itself, so no handle is leaked.
testScoresDF = pd.read_csv('test_scores.csv')
print(testScoresDF.head())

# Step 3: use numpy to determine Q1, Q2, Q3
quantiles = np.quantile(testScoresDF, q=[0.25, 0.5, 0.75])
logging.debug("%s", quantiles)

# Step 4: repeat step 3 with deciles
# D1..D10 are the 0.1, 0.2, ..., 1.0 quantiles. Dividing an integer range by 10
# avoids float accumulation pushing the last value above 1.0, which np.quantile rejects.
deciles = np.quantile(testScoresDF, q=np.arange(1, 11) / 10)
logging.debug("%s", deciles)

# Step 5: repeat step 3 with percentiles
# np.percentile takes q on a 0-100 scale (np.quantile uses 0-1), so P1..P99 are 1..99.
percentiles = np.percentile(testScoresDF, q=np.arange(1, 100))
logging.debug("%s", percentiles)

# Step 6: plot the results

data = testScoresDF

fig, axs = plt.subplots(2, 3)

# basic plot
logging.debug("Step 2: executing basic plot")
axs[0, 0].boxplot(data)
axs[0, 0].set_title('basic plot')

# notched plot
logging.debug("Step 3: executing notched plot")
axs[0, 1].boxplot(data, notch=True)
axs[0, 1].set_title('notched plot')

# change outlier point symbols
logging.debug("Step 4: change outlier point symbols")
axs[0, 2].boxplot(data, notch=False, sym='gD')
axs[0, 2].set_title('change outlier\npoint symbols')

# don't show outlier points
logging.debug("Step 5: don't show outlier points")
axs[1, 0].boxplot(data, notch=False, sym='')
axs[1, 0].set_title("don't show\noutlier points")

# horizontal boxes
logging.debug("Step 6: add horizontal boxes")
axs[1, 1].boxplot(data, notch=False, sym='rs', vert=False)
axs[1, 1].set_title('horizontal boxes')

# change whisker length
logging.debug("Step 7: change whisker length")
axs[1, 2].boxplot(data, notch=False, sym='rs', vert=False, whis=0.75)
axs[1, 2].set_title('change whisker length')

fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                    hspace=0.4, wspace=0.3)

# Multiple box plots on one Axes
logging.debug("Step 9: show Multiple box plots on one Axes")
fig, ax = plt.subplots()
ax.boxplot(data)

logging.debug("Step 10: show results")
plt.show()
# Author: Evan Gertis
# Date  : 10/22
# program : quantile decile calculator
import csv
import logging 
import coloredlogs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: read csv
# Passing the path lets pandas open and close the file itself, so no handle is leaked.
testScoresDF = pd.read_csv('test_scores.csv')
print(testScoresDF.head())

# Step 3: use numpy to determine Q1, Q2, Q3
quantiles = np.quantile(testScoresDF, q=[0.25, 0.5, 0.75])
logging.debug("%s", quantiles)

# Step 4: repeat step 3 with deciles
# D1..D10 are the 0.1, 0.2, ..., 1.0 quantiles. Dividing an integer range by 10
# avoids float accumulation pushing the last value above 1.0, which np.quantile rejects.
deciles = np.quantile(testScoresDF, q=np.arange(1, 11) / 10)
logging.debug("%s", deciles)

# Step 5: repeat step 3 with percentiles
# np.percentile takes q on a 0-100 scale (np.quantile uses 0-1), so P1..P99 are 1..99.
percentiles = np.percentile(testScoresDF, q=np.arange(1, 100))
logging.debug("%s", percentiles)

# Step 6: plot the results

data = testScoresDF

fig, axs = plt.subplots(2, 3)

# basic plot
logging.debug("Step 2: executing basic plot")
axs[0, 0].boxplot(data)
axs[0, 0].set_title('basic plot')

# notched plot
logging.debug("Step 3: executing notched plot")
axs[0, 1].boxplot(data, notch=True)
axs[0, 1].set_title('notched plot')

# change outlier point symbols
logging.debug("Step 4: change outlier point symbols")
axs[0, 2].boxplot(data, notch=False, sym='gD')
axs[0, 2].set_title('change outlier\npoint symbols')

# don't show outlier points
logging.debug("Step 5: don't show outlier points")
axs[1, 0].boxplot(data, notch=False, sym='')
axs[1, 0].set_title("don't show\noutlier points")

# horizontal boxes
logging.debug("Step 6: add horizontal boxes")
axs[1, 1].boxplot(data, notch=False, sym='rs', vert=False)
axs[1, 1].set_title('horizontal boxes')

# change whisker length
logging.debug("Step 7: change whisker length")
axs[1, 2].boxplot(data, notch=False, sym='rs', vert=False, whis=0.75)
axs[1, 2].set_title('change whisker length')

fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                    hspace=0.4, wspace=0.3)

# Multiple box plots on one Axes
logging.debug("Step 9: show Multiple box plots on one Axes")
fig, ax = plt.subplots()
ax.boxplot(data)

logging.debug("Step 10: show results")
plt.show()