I don't know how to compute the average R squared from regressions of each individual stock's return on the market return.
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
# Average R squared of per-(company, year) regressions of the individual
# stock return on the market return.
lm = LinearRegression()
df1 = pd.read_csv(r'C:\Users\USER\Desktop\股價資料.csv')
model = LinearRegression()  # not used below; kept so the name stays defined
x1 = df1[['NAME OF COMPANY']]
y1 = df1[['INDIVIDUAL COMPANY RETURN']]
y2 = df1[['MARKET RETURN']]
z1 = df1[['YEAR']]

list1 = []  # row positions at which a new (company, year) run starts
list2 = []  # R squared of each run's regression
list3 = []  # market returns of the run currently being fitted (regressor)
list4 = []  # individual returns of the run currently being fitted (response)

company = x1.values[:, 0]
year = z1.values[:, 0]
stock_return = np.asarray(y1.values[:, 0], dtype=float)
market_return = np.asarray(y2.values[:, 0], dtype=float)

# A new run starts wherever the company or the year differs from the previous
# row. Comparing shifted slices never reads past the last row, and the row
# count comes from the data instead of being hard-coded.
# NOTE(review): this assumes all rows of one company-year are contiguous in
# the CSV -- confirm, or sort by company and year first.
changed = (company[1:] != company[:-1]) | (year[1:] != year[:-1])
list1 = (np.flatnonzero(changed) + 1).tolist()

# Add the first and last row positions so every run has a start and a stop.
edges = [0, *list1, len(df1)]
for start, stop in zip(edges[:-1], edges[1:]):
    list3 = market_return[start:stop]
    list4 = stock_return[start:stop]
    # LinearRegression.fit rejects NaN, so drop rows with a missing return.
    usable = ~(np.isnan(list3) | np.isnan(list4))
    list3 = list3[usable].reshape(-1, 1)  # sklearn expects a 2-D regressor
    list4 = list4[usable]
    if len(list4) < 2:
        # A line cannot be meaningfully fitted to fewer than two points.
        continue
    lm.fit(list3, list4)
    list2.append(lm.score(list3, list4))  # score() returns R squared

average_r_squared = np.mean(list2) if list2 else float('nan')
print(average_r_squared)
This is what my code looks like now. I've tried to split y1 and y2 into one segment per company and year, using the row positions appended to list1, so that I can compute R squared for each segment.
CodePudding user response:
Formula of R squared:
$R^2 = 1 - \dfrac{\text{unexplained variation}}{\text{total variation}}$
Unexplained variation
is the sum, over all data points, of the squared difference between the value predicted by the line of best fit and the actual value. You can compute the coefficients of the line of best fit with numpy.polyfit().
Total variation
is the sum, over all data points, of the squared difference between the average value and the actual value.
EDIT: With dummy values, it would look something like this:
import numpy as np
# Dummy sample: five consecutive years and one observed value per year.
x = list(range(2000, 2005))
y = [
    50000,
    10000,
    20000,
    30000,
    5000,
]
def get_unexplained_variation(xs, ys):
    """Return the residual sum of squares of the best-fit line through the data.

    A degree-1 polynomial (straight line) is fitted to the points with
    ``np.polyfit``; the result is the sum over all points of the squared
    difference between the fitted value and the actual value.

    Args:
        xs: Sequence of x coordinates.
        ys: Sequence of y coordinates, same length as ``xs``.

    Returns:
        The unexplained variation as a float.
    """
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)
    slope, intercept = np.polyfit(xs, ys, 1)
    residuals = slope * xs + intercept - ys
    # Sum over every point; keeping only the last squared residual would
    # understate the unexplained variation.
    return float(np.sum(residuals ** 2))
def get_total_variation(xs, ys):
    """Return the total sum of squares of ``ys`` about their mean.

    Args:
        xs: Unused; accepted so the signature parallels
            ``get_unexplained_variation``.
        ys: Sequence of observed values.

    Returns:
        The sum over all values of the squared difference between the value
        and the mean of ``ys``, as a float.
    """
    ys = np.asarray(ys, dtype=float)
    deviations = ys - np.mean(ys)
    # Sum over every point; keeping only the last squared deviation would
    # understate the total variation.
    return float(np.sum(deviations ** 2))
# R squared = 1 - (unexplained variation / total variation) for the sample.
unexplained_variation = get_unexplained_variation(x, y)
total_variation = get_total_variation(x, y)
explained_share = 1 - unexplained_variation / total_variation
print(explained_share)