from statistics import mean
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')

from sklearn import datasets



def create_dataset(hm, variance, step=2, correlation=False):
    """Generate hm sample points with the given variance.

    correlation='pos' makes ys trend upward by `step` per point,
    correlation='neg' makes them trend downward; False leaves no trend.
    """
    val = 1
    ys = []
    for i in range(hm):
        # jitter the current value by a random amount within +/- variance
        y = val + random.randrange(-variance, variance)
        ys.append(y)
        if correlation == 'pos':
            val += step
        elif correlation == 'neg':
            val -= step

    xs = [i for i in range(len(ys))]

    return np.array(xs, dtype=np.float64), np.array(ys, dtype=np.float64)
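
# A minimal usage sketch for create_dataset (an added example, kept commented
# out like the other demos below): larger variance means noisier data, and
# correlation='neg' flips the trend downward.
##demo_xs, demo_ys = create_dataset(20, 5, step=2, correlation='neg')
##plt.scatter(demo_xs, demo_ys)
##plt.show()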

def best_fit_slope_and_intercept(xs, ys):
    # Ordinary least squares for a single feature:
    # m = (x_bar*y_bar - mean(x*y)) / (x_bar^2 - mean(x^2)), b = y_bar - m*x_bar.
    # xs and ys must be numpy arrays so that xs*ys is element-wise.
    m = (((mean(xs)*mean(ys)) - mean(xs*ys)) /
         ((mean(xs)*mean(xs)) - mean(xs*xs)))

    b = mean(ys) - m*mean(xs)

    return m, b
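
# A quick sanity check for the slope/intercept formula (an added sketch, not
# part of the original tutorial flow): perfectly linear data y = 2x + 1 should
# recover a slope of ~2.0 and an intercept of ~1.0.
##check_xs = np.array([0, 1, 2, 3, 4], dtype=np.float64)
##check_ys = 2*check_xs + 1
##print(best_fit_slope_and_intercept(check_xs, check_ys))  # expect (2.0, 1.0)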


def coefficient_of_determination(ys_orig, ys_line):
    # r^2 = 1 - SE(regression line) / SE(mean line), where SE is the sum of
    # squared errors against the original y values.
    ys_orig = np.asarray(ys_orig, dtype=np.float64)
    ys_line = np.asarray(ys_line, dtype=np.float64)
    y_mean_line = np.full_like(ys_orig, mean(ys_orig))
    squared_error_regr = sum((ys_line - ys_orig) ** 2)
    squared_error_y_mean = sum((y_mean_line - ys_orig) ** 2)
    r_squared = 1 - (squared_error_regr / squared_error_y_mean)

    return r_squared
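
# Sanity check for r^2 (an added sketch, not from the original code): a line
# that matches the data exactly should score 1.0, while predicting the mean
# everywhere should score 0.0.
##perfect_ys = np.array([1.0, 2.0, 3.0])
##print(coefficient_of_determination(perfect_ys, perfect_ys))       # expect 1.0
##print(coefficient_of_determination(perfect_ys, [2.0, 2.0, 2.0]))  # expect 0.0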


# Quick scatter demo: two small hand-picked groups of points, plotted with
# different markers and colours.
xs1, ys1 = [6, 7, 8], [5, 7, 6]
xs2, ys2 = [1, 2, 3], [2, 3, 1]

plt.scatter(xs1, ys1, c='r', marker='+', s=150, linewidth=5)
plt.scatter(xs2, ys2, c='k', marker='o', s=110)

plt.show()



# Disabled demo: generate two Gaussian blobs with scikit-learn and scatter them.
##sample_data = datasets.make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.5, center_box=(-10.0, 10.0), shuffle=True, random_state=None)
##print(sample_data[0])
##plt.scatter(sample_data[0][:,0],sample_data[0][:,1])
##plt.show()



# Disabled end-to-end demo: build a noisy dataset, fit the regression line,
# report r^2, and plot the result.
##xs, ys = create_dataset(40,10,2,correlation='pos')
##m, b = best_fit_slope_and_intercept(xs,ys)
##regression_line = [(m*x)+b for x in xs]
##r_squared = coefficient_of_determination(ys,regression_line)
##print(r_squared)
##
##plt.scatter(xs,ys,color='#003F72', label = 'data')
##plt.plot(xs, regression_line, label = 'regression line')
##plt.legend(loc=4)
##plt.show()
