import mysql.connector
import pandas as pd
mydb = mysql.connector.connect(
    host="localhost",
    port="3317",
    user="root",
    password="07SQLDbaze2021",)
d = pd.read_sql('''
        select d.carat, d.cut, cu.cut_rating, d.clarity, c.clarity_rating, d.color, s.color_rating, d.price, round(d.price / d.carat, 0) as price_by_carat, d.depth, d.table, d.x, d.y, d.z
        from sql.diamonds d
        JOIN sql.clarity c ON c.clarity=d.clarity
        JOIN sql.color s ON s.color=d.color
        JOIN sql.cut cu ON  cu.cut=d.cut
        where (d.carat != 0) and (d.price != 0) and (d.table != 0) and (d.depth != 0) and (d.x != 0) and (d.y != 0) and (d.z != 0);''', con=mydb)
d


def categoric(x):
    if x['carat'] <= 1:
        return '0.2 - 1'
    elif (x['carat'] > 1) & (x['carat'] < 3):
          return ' > 1 - 3'
    else:
          return '> 3'
        
d['carats_categoric'] = d.apply(categoric, axis=1)


d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53920 entries, 0 to 53919
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   carat             53920 non-null  float64
 1   cut               53920 non-null  object 
 2   cut_rating        53920 non-null  int64  
 3   clarity           53920 non-null  object 
 4   clarity_rating    53920 non-null  int64  
 5   color             53920 non-null  object 
 6   color_rating      53920 non-null  int64  
 7   price             53920 non-null  int64  
 8   price_by_carat    53920 non-null  float64
 9   depth             53920 non-null  float64
 10  table             53920 non-null  int64  
 11  x                 53920 non-null  float64
 12  y                 53920 non-null  float64
 13  z                 53920 non-null  float64
 14  carats_categoric  53920 non-null  object 
dtypes: float64(6), int64(5), object(4)
memory usage: 6.2+ MB


d.describe()


d['skirtumas'] = d.depth - round((d.z / ((d.x + d.y) / 2)) * 100, 1)


d['y_daugiau_uz_t'] = (d['y'] - ((d['table'] / 100) * d['y']))
d.describe()


diamonds = d[(d['skirtumas'] < 0.01) & (d['skirtumas'] > (-0.01))]
diamonds.describe()


d = diamonds.drop(columns=['skirtumas', 'y_daugiau_uz_t'])
d.to_csv('diamonds.csv')


import seaborn as sns
import matplotlib.pyplot as plt


d.groupby(['cut', 'cut_rating'])[['cut']].count().sort_values(by='cut_rating', ascending = False)


sns.countplot(x = 'cut', data = d, color = 'c')

<AxesSubplot:xlabel='cut', ylabel='count'>


d.groupby(['clarity', 'clarity_rating'])[['clarity']].count().sort_values(by='clarity_rating', ascending = False)


sns.countplot(x = 'clarity', data = d, color = 'c')

<AxesSubplot:xlabel='clarity', ylabel='count'>


d.groupby(['color', 'color_rating'])[['color']].count().sort_values(by='color_rating', ascending = False)


sns.countplot(x = 'color', data = d, color = 'c')

<AxesSubplot:xlabel='color', ylabel='count'>


categoric_by_cut = d.groupby(['carats_categoric', 'cut'])[['carat']].count().sort_values(by='carats_categoric', ascending = False)
categoric_by_cut


categoric_by_cut.idxmax()

carat    (0.2 - 1, Ideal)
dtype: object


sns.countplot(x = 'carats_categoric', data = d, color = 'c')

<AxesSubplot:xlabel='carats_categoric', ylabel='count'>


categoric_by_clarity = d.groupby(['carats_categoric', 'clarity'])[['clarity']].count().sort_values(by='carats_categoric', ascending = False)
categoric_by_clarity


categoric_by_clarity.idxmax()

clarity    (0.2 - 1, VS2)
dtype: object


categoric_by_color = d.groupby(['carats_categoric', 'color'])[['color']].count().sort_values(by='carats_categoric', ascending = False)
categoric_by_color


categoric_by_color.idxmax()

color    (0.2 - 1, E)
dtype: object


sns.countplot(x = 'depth', data = d, color = 'c')

<AxesSubplot:xlabel='depth', ylabel='count'>


sns.countplot(x = 'table', data = d, color = 'c')

<AxesSubplot:xlabel='table', ylabel='count'>


sns.countplot(x = 'x', data = d, color = 'c')

<AxesSubplot:xlabel='x', ylabel='count'>


sns.countplot(x = 'y', data = d, color = 'c')

<AxesSubplot:xlabel='y', ylabel='count'>


sns.countplot(x = 'z', data = d, color = 'c')

<AxesSubplot:xlabel='z', ylabel='count'>


group_cut = d.groupby(['carat', 'cut_rating'])[['price']].mean().reset_index()
by_cut = group_cut[['carat', 'price']]
group_clarity = d.groupby(['carat', 'clarity_rating'])[['price']].mean().reset_index()
by_clarity = group_clarity[['carat', 'price']]
group_color = d.groupby(['carat', 'color_rating'])[['price']].mean().reset_index()
by_color = group_color[['carat', 'price']]

by_carat_group_cut = d.groupby(['carat', 'cut_rating'])[['price_by_carat']].mean().reset_index()
by_carat_by_cut = by_carat_group_cut[['carat', 'price_by_carat']]
by_carat_by_cut = by_carat_group_cut[['carat', 'price_by_carat']]
by_carat_group_clarity = d.groupby(['carat', 'clarity_rating'])[['price_by_carat']].mean().reset_index()
by_carat_by_clarity = by_carat_group_clarity[['carat', 'price_by_carat']]
by_carat_group_color = d.groupby(['carat', 'color_rating'])[['price_by_carat']].mean().reset_index()
by_carat_by_color = by_carat_group_color[['carat', 'price_by_carat']]


by_cut.plot.scatter(x = 'carat', y = 'price', label='by_cut')

<AxesSubplot:xlabel='carat', ylabel='price'>


mmby_carat_by_cut = by_carat_by_cut.agg([min, max])
mmby_carat_by_cut


by_clarity.plot.scatter(x = 'carat', y = 'price', label='by_clarity')

<AxesSubplot:xlabel='carat', ylabel='price'>


mmby_carat_by_clarity = by_carat_by_clarity.agg([min, max])
mmby_carat_by_clarity


by_color.plot.scatter(x = 'carat', y = 'price', label='by_color')

<AxesSubplot:xlabel='carat', ylabel='price'>


mmby_carat_by_color = by_carat_by_color.agg([min, max])
mmby_carat_by_color


palyginimas = mmby_carat_by_cut.join(mmby_carat_by_clarity, lsuffix = '_cut', rsuffix = '_clarity').join(mmby_carat_by_color, rsuffix = '_color')
min_max_palyginimas = palyginimas[['price_by_carat_cut', 'price_by_carat_clarity', 'price_by_carat']]
min_max_palyginimas


max_price = d['price'].max()
d[d['price'] == max_price]


min_price = d['price'].min()
d[d['price'] == min_price]


d.sort_values(by='price', ascending=False)


from sklearn.linear_model import LinearRegression
karatai = 1


X = d[['carat']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
plt.scatter(X, y)
plt.plot(X, prediction)
r_squared = model.score(X, y)
r_squared

0.854884676974301


karatai = 1
max_cu = d.groupby(['cut', 'cut_rating'])[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_cut = max_cu['cut_rating'][0]
max_cl = d.groupby(['clarity', 'clarity_rating'])[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_clarity = max_cl['clarity_rating'][0]
max_co = d.groupby(['color', 'color_rating'])[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_color = max_co['color_rating'][0]


X = d[['carat', 'cut_rating', 'clarity_rating', 'color_rating']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_cut, max_clarity, max_color]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_d = model.score(X, y)
r_squared_by_d

0.9078412644480415


X = d[['carat', 'cut_rating']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_cut]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_cut = model.score(X, y)
r_squared_by_cut

0.8589672604653408


X = d[['carat', 'clarity_rating']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_clarity]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_clarity = model.score(X, y)
r_squared_by_clarity

0.8898027973449545


X = d[['carat', 'color_rating']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_color]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_color = model.score(X, y)
r_squared_by_color

0.8650031417930991


max_d = d.groupby('depth')[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_depth = max_d['depth'][0]
X = d[['carat', 'depth']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_depth]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_depth = model.score(X, y)
r_squared_by_depth

0.8559385593218629


max_t = d.groupby('table')[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_table = max_t['table'][0]
X = d[['carat', 'table']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_table]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_table = model.score(X, y)
r_squared_by_table

0.8562456844086875


max_xx = d.groupby('x')[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_x = max_xx['x'][0]
X = d[['carat', 'x']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_x]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_x = model.score(X, y)
r_squared_by_x

0.8607155265423889


max_yy = d.groupby('y')[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_y = max_yy['y'][0]
X = d[['carat', 'y']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_y]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_y = model.score(X, y)
r_squared_by_y

0.8593442780079718


max_zz = d.groupby('z')[['price']].count().sort_values(by='price', ascending=False).reset_index()
max_z = max_zz['z'][0]
X = d[['carat', 'z']]
y = d['price']
model = LinearRegression().fit(X, y)
model.predict([[karatai, max_z]])
d['prediction'] = 'prediction'
prediction = model.predict(X)
d['prediction'] = prediction
r_squared_by_z = model.score(X, y)
r_squared_by_z

0.8632533056355854


rezultatas = pd.Series([r_squared, r_squared_by_d, r_squared_by_cut, r_squared_by_clarity, r_squared_by_color, r_squared_by_depth, r_squared_by_table, r_squared_by_x, r_squared_by_y, r_squared_by_z], index=['only carat', 'by cut & clarity & color & carat', 'by cut & carat', 'by clarity & carat', 'by color & carat', 'by depth & carat', 'by table & carat', 'by x & carat', 'by y & carat', 'by z & carat'])
rezultatas.sort_values(ascending=False)

by cut & clarity & color & carat    0.907841
by clarity & carat                  0.889803
by color & carat                    0.865003
by z & carat                        0.863253
by x & carat                        0.860716
by y & carat                        0.859344
by cut & carat                      0.858967
by table & carat                    0.856246
by depth & carat                    0.855939
only carat                          0.854885
dtype: float64


# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# X = d[['carat']]
# y = d['price']
# train_test_split(X, y, test_size=0.3)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# model = LogisticRegression().fit(X_train, y_train)
# y_predicted = model.predict(X_test)

# # from sklearn.linear_model import LogisticRegression
# # from sklearn.model_selection import train_test_split
# # X = d[['carat', 'cut_rating', 'clarity_rating', 'color_rating']]
# # y = d['price']
# # train_test_split(X, y, test_size=0.3)
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# # model = LogisticRegression()
# # model.fit(X_train, y_train)
# # y_predicted = model.predict(X_test)


# from sklearn import metrics
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# model = LogisticRegression().fit(X_train, y_train)
# y_predicted = model.predict(X_test)
# model.score(X_test, y_test)
# metrics.confusion_matrix(y_test, y_predicted)

	carat	cut	cut_rating	clarity	clarity_rating	color	color_rating	price	price_by_carat	depth	table	x	y	z
0	0.23	Ideal	10	SI2	4	E	9	326	1417.0	61.5	55	3.95	3.98	2.43
1	0.21	Premium	9	SI1	5	E	9	326	1552.0	59.8	61	3.89	3.84	2.31
2	0.23	Good	7	VS1	7	E	9	327	1422.0	56.9	65	4.05	4.07	2.31
3	0.29	Premium	9	VS2	6	I	5	334	1152.0	62.4	58	4.20	4.23	2.63
4	0.31	Good	7	SI2	4	J	4	335	1081.0	63.3	58	4.34	4.35	2.75
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
53915	0.72	Ideal	10	SI1	5	D	10	2757	3829.0	60.8	57	5.75	5.76	3.50
53916	0.72	Good	7	SI1	5	D	10	2757	3829.0	63.1	55	5.69	5.75	3.61
53917	0.70	Very Good	8	SI1	5	D	10	2757	3939.0	62.8	60	5.66	5.68	3.56
53918	0.86	Premium	9	SI2	4	H	6	2757	3206.0	61.0	58	6.15	6.12	3.74
53919	0.75	Ideal	10	SI2	4	D	10	2757	3676.0	62.2	55	5.83	5.87	3.64

	carat	cut_rating	clarity_rating	color_rating	price	price_by_carat	depth	table	x	y	z
count	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000
mean	0.797698	8.904228	6.051502	7.405972	3930.993231	4007.835757	61.749514	57.457808	5.731627	5.734887	3.540046
std	0.473795	1.116579	1.647005	1.701272	3987.280446	2012.173373	1.432331	2.233989	1.119423	1.140126	0.702530
min	0.200000	6.000000	3.000000	4.000000	326.000000	1051.000000	43.000000	43.000000	3.730000	3.680000	1.070000
25%	0.400000	8.000000	5.000000	6.000000	949.000000	2477.750000	61.000000	56.000000	4.710000	4.720000	2.910000
50%	0.700000	9.000000	6.000000	7.000000	2401.000000	3495.000000	61.800000	57.000000	5.700000	5.710000	3.530000
75%	1.040000	10.000000	7.000000	9.000000	5323.250000	4949.000000	62.500000	59.000000	6.540000	6.540000	4.040000
max	5.010000	10.000000	10.000000	10.000000	18823.000000	17829.000000	79.000000	95.000000	10.740000	58.900000	31.800000

	carat	cut_rating	clarity_rating	color_rating	price	price_by_carat	depth	table	x	y	z	skirtumas	y_daugiau_uz_t
count	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000	53920.000000
mean	0.797698	8.904228	6.051502	7.405972	3930.993231	4007.835757	61.749514	57.457808	5.731627	5.734887	3.540046	-0.009243	2.435048
std	0.473795	1.116579	1.647005	1.701272	3987.280446	2012.173373	1.432331	2.233989	1.119423	1.140126	0.702530	2.452803	0.478514
min	0.200000	6.000000	3.000000	4.000000	326.000000	1051.000000	43.000000	43.000000	3.730000	3.680000	1.070000	-557.500000	0.415500
25%	0.400000	8.000000	5.000000	6.000000	949.000000	2477.750000	61.000000	56.000000	4.710000	4.720000	2.910000	0.000000	2.024400
50%	0.700000	9.000000	6.000000	7.000000	2401.000000	3495.000000	61.800000	57.000000	5.700000	5.710000	3.530000	0.000000	2.394200
75%	1.040000	10.000000	7.000000	9.000000	5323.250000	4949.000000	62.500000	59.000000	6.540000	6.540000	4.040000	0.000000	2.776800
max	5.010000	10.000000	10.000000	10.000000	18823.000000	17829.000000	79.000000	95.000000	10.740000	58.900000	31.800000	44.500000	25.327000

	carat	cut_rating	clarity_rating	color_rating	price	price_by_carat	depth	table	x	y	z	skirtumas	y_daugiau_uz_t
count	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.000000	50151.0	50151.000000
mean	0.795001	8.902136	6.047257	7.433551	3929.510119	4013.239676	61.745527	57.492572	5.725679	5.728136	3.535584	0.0	2.430213
std	0.470870	1.100926	1.633693	1.708803	3988.853886	2015.160759	1.400397	2.203546	1.117092	1.110107	0.689070	0.0	0.464031
min	0.200000	6.000000	3.000000	4.000000	326.000000	1051.000000	51.000000	44.000000	3.730000	3.680000	2.240000	0.0	0.415500
25%	0.400000	8.000000	5.000000	6.000000	945.000000	2477.000000	61.000000	56.000000	4.710000	4.720000	2.910000	0.0	2.020500
50%	0.700000	9.000000	6.000000	7.000000	2401.000000	3504.000000	61.800000	57.000000	5.690000	5.710000	3.520000	0.0	2.394000
75%	1.040000	10.000000	7.000000	9.000000	5321.500000	4955.000000	62.500000	59.000000	6.530000	6.530000	4.030000	0.0	2.773500
max	4.130000	10.000000	10.000000	10.000000	18823.000000	17829.000000	78.200000	95.000000	10.140000	10.100000	6.430000	0.0	4.483200

		cut
cut	cut_rating
Ideal	10	19611
Premium	9	13269
Very Good	8	11342
Good	7	4610
Fair	6	1319

Deimanto kaina labai priklauso nuo kokybės parametrų ir galima nuspėti kainą, žinant juos.

Tikslai:¶

Peržiūrime kokie yra deimanto kokybės parametrai, kaip jie reitinguojami ir koks jų pasiskirstymas duomenyse.¶

Peržiūrime kokie yra deimanto išmatavimo parametrai.¶

Apskaičiuojame kokie yra deimantų kainų vidurkiai, pagal kokybės parametrus ir surandame didžiausią ir mažiausią kainą už karatą, pagal visus parametrus.¶

Palyginame, kaip skiriasi deimanto kainos už karatą, pagal kokybės parametrus.¶

Kuriame modelį, pagal kurį galima būtų nuspėti deimanto kainą, turint karatų kiekį.¶

Išvados¶

		clarity
clarity	clarity_rating
IF	10	1651
VVS1	9	3367
VVS2	8	4605
VS1	7	7486
VS2	6	11606
SI1	5	12398
SI2	4	8481
I1	3	557

		color
color	color_rating
D	10	6545
E	9	9331
F	8	8770
G	7	10352
H	6	7582
I	5	4971
J	4	2600

		carat
carats_categoric	cut
> 3	Fair	8
	Good	5
	Ideal	3
	Premium	11
	Very Good	2
0.2 - 1	Fair	785
	Good	3127
	Ideal	14567
	Premium	7843
	Very Good	7611
> 1 - 3	Fair	526
	Good	1478
	Ideal	5041
	Premium	5415
	Very Good	3729

	carat	cut	cut_rating	clarity	clarity_rating	color	color_rating	price	price_by_carat	depth	table	x	y	z	carats_categoric
27732	2.29	Premium	9	VS2	6	I	5	18823	8220.0	60.8	60	8.50	8.47	5.16	> 1 - 3
27731	2.00	Very Good	8	SI1	5	G	7	18818	9409.0	63.5	56	7.90	7.97	5.04	> 1 - 3
27730	1.51	Ideal	10	IF	10	G	7	18806	12454.0	61.7	55	7.37	7.41	4.56	> 1 - 3
27728	2.00	Very Good	8	SI1	5	H	6	18803	9402.0	62.8	57	7.95	8.00	5.01	> 1 - 3
27727	2.29	Premium	9	SI1	5	I	5	18797	8208.0	61.8	59	8.52	8.45	5.24	> 1 - 3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5	0.24	Very Good	8	VVS2	8	J	4	336	1400.0	62.8	57	3.94	3.96	2.48	0.2 - 1
4	0.31	Good	7	SI2	4	J	4	335	1081.0	63.3	58	4.34	4.35	2.75	0.2 - 1
3	0.29	Premium	9	VS2	6	I	5	334	1152.0	62.4	58	4.20	4.23	2.63	0.2 - 1
2	0.23	Good	7	VS1	7	E	9	327	1422.0	56.9	65	4.05	4.07	2.31	0.2 - 1
1	0.21	Premium	9	SI1	5	E	9	326	1552.0	59.8	61	3.89	3.84	2.31	0.2 - 1

		color
carats_categoric	color
> 3	J	6
	I	14
	H	5
	G	2
	E	1
	D	1
0.2 - 1	H	4247
	J	1077
	I	2505
	G	6923
	F	6347
	E	7530
	D	5304
> 1 - 3	E	1800
	J	1517
	I	2452
	H	3330
	G	3427
	F	2423
	D	1240

	carat	price_by_carat
min	0.20	1374.0
max	4.13	10277.5

	carat	price_by_carat
min	0.20	1271.0
max	4.13	13861.0

	carat	price_by_carat
min	0.20	1339.0
max	4.13	11971.0