import mysql.connector as con
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

def upload_csv_to_mysql(csv_file_path, database_url, table_name):
    try:
        # Perskaityti CSV failą ir surašyti data i dataframe'a
        data = pd.read_csv(csv_file_path)
        print(f"CSV file '{csv_file_path}' loaded successfully.\n")

        engine = create_engine(database_url)
        print(f"Connected to the MySQL database {database_url}.\n")

        # Įkelti dataframe'a į mySQL serverį
        data.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Data uploaded successfully to table '{table_name}'.\n")

    except Exception as e:
        print(f"An error occurred: {e}\n")

PATH_TO_TABLE = {"PopularityFULL.csv": "popularity", "salaryUpdated.csv": "salaries"}
DATABASE_URL = "mysql+pymysql://root:CounterStrike!@localhost/baigiamasis"
for PATH, TABLE in PATH_TO_TABLE.items():
    upload_csv_to_mysql(PATH, DATABASE_URL, TABLE)

CSV file 'PopularityFULL.csv' loaded successfully.

Connected to the MySQL database mysql+pymysql://root:CounterStrike!@localhost/baigiamasis.

Data uploaded successfully to table 'popularity'.

CSV file 'salaryUpdated.csv' loaded successfully.

Connected to the MySQL database mysql+pymysql://root:CounterStrike!@localhost/baigiamasis.

Data uploaded successfully to table 'salaries'.

my_db = con.connect(
    host='localhost',
    port='3306',
    user='root',
    password='CounterStrike!'
)

popularity = pd.read_sql('SELECT * FROM baigiamasis.popularity', con=my_db)
salaries = pd.read_sql('SELECT * FROM baigiamasis.salaries', con=my_db).set_index('row_index')

C:\Users\augus\AppData\Local\Temp\ipykernel_143240\1297533587.py:1: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  popularity = pd.read_sql('SELECT * FROM baigiamasis.popularity', con=my_db)
C:\Users\augus\AppData\Local\Temp\ipykernel_143240\1297533587.py:2: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  salaries = pd.read_sql('SELECT * FROM baigiamasis.salaries', con=my_db).set_index('row_index')

popularity.head(10)

unique_id = dict(enumerate(popularity['Attribute'].unique()))
unique_id = {k:v for v, k in unique_id.items()}
unique_id

{'Abap': 0,
 'Ada': 1,
 'C/C++': 2,
 'C#': 3,
 'Cobol': 4,
 'Dart': 5,
 'Delphi/Pascal': 6,
 'Go': 7,
 'Groovy': 8,
 'Haskell': 9,
 'Java': 10,
 'JavaScript': 11,
 'Julia': 12,
 'Kotlin': 13,
 'Lua': 14,
 'Matlab': 15,
 'Objective-C': 16,
 'Perl': 17,
 'PHP': 18,
 'Powershell': 19,
 'Python': 20,
 'R': 21,
 'Ruby': 22,
 'Rust': 23,
 'Scala': 24,
 'Swift': 25,
 'TypeScript': 26,
 'VBA': 27,
 'Visual Basic': 28}

def toID(x):
    return unique_id[x]
popularity['id'] = popularity['Attribute'].map(toID)
popularity.head(5)

popularity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7134 entries, 0 to 7133
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       7134 non-null   object 
 1   Attribute  7134 non-null   object 
 2   Value      7134 non-null   float64
 3   id         7134 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 223.1+ KB

type(popularity['Date'].iloc[0])

str

popularity['Date'] = popularity['Date'].map(lambda x : dt.datetime.strptime(x, "%m/%d/%Y"))

popularity

popularity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7134 entries, 0 to 7133
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       7134 non-null   datetime64[ns]
 1   Attribute  7134 non-null   object        
 2   Value      7134 non-null   float64       
 3   id         7134 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 223.1+ KB

salaries.head(10)

salaries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16524 entries, 0 to 16533
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           16524 non-null  int64 
 1   experience_level    16524 non-null  object
 2   employment_type     16524 non-null  object
 3   job_title           16524 non-null  object
 4   salary              16524 non-null  int64 
 5   salary_currency     16524 non-null  object
 6   salary_in_usd       16524 non-null  int64 
 7   employee_residence  16524 non-null  object
 8   remote_ratio        16524 non-null  int64 
 9   company_location    16524 non-null  object
 10  company_size        16524 non-null  object
 11  languages           16524 non-null  object
dtypes: int64(4), object(8)
memory usage: 1.6+ MB

salaries['languages'] = salaries['languages'].str.split(',')

salaries['languages'][2]

['Python', ' SQL', ' Java', ' Scala']

salaries['languages'] = salaries['languages'].apply(lambda x: [i.strip() for i in x])

salaries['languages'][2]

['Python', 'SQL', 'Java', 'Scala']

salaries_exploded = salaries.explode('languages')

salaries_exploded.head(5)

salaries_exploded.groupby('languages')['salary'].mean().sort_values(ascending=False).round(2)

languages
C++           201344.38
MATLAB        192017.30
Java          167721.62
Python        164913.09
R             159459.32
JavaScript    156466.00
Scala         151889.66
SQL           149375.53
Go            133980.59
Power BI      122373.92
Name: salary, dtype: float64

salaries

python_jobs = salaries[salaries['languages'].map(lambda x: 'Python' in x)]

f"{round((python_jobs.shape[0] / salaries.shape[0]) * 100)}% Darbų naudoja python programavimo kalbą"

'94% Darbų naudoja python programavimo kalbą'

python_df = popularity[popularity["Attribute"] == "Python"].sort_values("Date")

plt.figure(figsize=(12, 6))
plt.plot(python_df["Date"], python_df["Value"], marker="o", linestyle="-", color="b", label="Python")

plt.xlabel("Metai")
plt.ylabel("Populiarumas (%)")
plt.title("Python Kalbos Populiarumas pagal metus")
plt.grid(True)
plt.xticks(rotation=45)

plt.show()

popularity["Year"] = popularity["Date"].dt.year

python_df = popularity[popularity["Attribute"].str.lower() == "python"].sort_values("Year")

X = python_df["Year"].values.reshape(-1, 1)
y = python_df["Value"].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)

future_years = np.array(range(X[-1][0] + 1, X[-1][0] + 11)).reshape(-1, 1)
future_popularity = model.predict(future_years)


plt.figure(figsize=(12, 6))
plt.scatter(X, y, color="blue", label="Tikra Data")
plt.plot(X, model.predict(X), color="green", linestyle="--", label="Vidutinė linija")
plt.plot(future_years, future_popularity, color="red", marker="o", linestyle="-", label="Numatomas Populiarumas")
plt.xlabel("Metai")
plt.ylabel("Populiarumas (%)")
plt.title("Python Populiarumas Po 10 Metų")
plt.legend()
plt.grid(True)

plt.show()

scores = []
for i in range(3):
    scores.append(model.score(X_test, y_test))
avg_score = np.mean(scores)
if avg_score.round(2) >= 0.89:
    print(f"Šis mašininio mokymosi modelis yra tikslus, su tikslumu: {avg_score.round(2) * 100}%")
else:
    print(f"Šis mašininio mokymosi modelis nėra tikslus, su tikslumu: {avg_score.round(2) * 100}%")

Šis mašininio mokymosi modelis yra tikslus, su tikslumu: 90.0%

salaries

salaries_avg = salaries.groupby("work_year", as_index=False)["salary_in_usd"].mean()

X = salaries_avg["work_year"].values.reshape(-1, 1)
y = salaries_avg["salary_in_usd"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

future_years = np.arange(int(salaries_avg["work_year"].max()) + 1, int(salaries_avg["work_year"].max()) + 16, 1).reshape(-1, 1)
future_salaries = model.predict(future_years)

plt.figure(figsize=(10, 5))
plt.scatter(salaries_avg["work_year"], salaries_avg["salary_in_usd"], color="blue", label="Actual Average Salaries")
plt.plot(future_years, future_salaries, color="red", linestyle="dashed", label="Predicted Salaries")
plt.gca().tick_params(axis='x', which='major', pad=10)
plt.xticks(np.arange(salaries_avg["work_year"].min(), salaries_avg["work_year"].max() + 16, 1))

plt.xlabel("Metai")
plt.ylabel("Vidutinė alga (USD)")
plt.title("AI Programuotojų algos per kitus 15 metų")
plt.legend()
plt.grid(True)
plt.show()

a = round(salaries_avg['salary_in_usd'].iloc[0])
b = round(future_salaries[-1])
percent_difference = ((b - a) / a) * 100

f"{round(percent_difference)}%"

'315%'

scores = []
for i in range(3):
    scores.append(model.score(X_test, y_test))
avg_score = np.mean(scores)
if avg_score.round(2) >= 0.89:
    print(f"Šis mašininio mokymosi modelis yra tikslus, su tikslumu: {avg_score.round(2) * 100}%")
else:
    print(f"Šis mašininio mokymosi modelis nėra tikslus, su tikslumu: {avg_score.round(2) * 100}%")

Šis mašininio mokymosi modelis nėra tikslus, su tikslumu: 44.0%

	Date	Attribute	Value
0	7/1/2004	Abap	0.34
1	7/1/2004	Ada	0.36
2	7/1/2004	C/C++	10.01
3	7/1/2004	C#	4.68
4	7/1/2004	Cobol	0.42
5	7/1/2004	Dart	0.00
6	7/1/2004	Delphi/Pascal	2.80
7	7/1/2004	Go	0.00
8	7/1/2004	Groovy	0.03
9	7/1/2004	Haskell	0.22

	Date	Attribute	Value	id
0	7/1/2004	Abap	0.34	0
1	7/1/2004	Ada	0.36	1
2	7/1/2004	C/C++	10.01	2
3	7/1/2004	C#	4.68	3
4	7/1/2004	Cobol	0.42	4

	Date	Attribute	Value	id
0	2004-07-01	Abap	0.34	0
1	2004-07-01	Ada	0.36	1
2	2004-07-01	C/C++	10.01	2
3	2004-07-01	C#	4.68	3
4	2004-07-01	Cobol	0.42	4
...	...	...	...	...
7129	2024-12-01	Scala	0.49	24
7130	2024-12-01	Swift	2.59	25
7131	2024-12-01	TypeScript	2.87	26
7132	2024-12-01	VBA	0.98	27
7133	2024-12-01	Visual Basic	0.46	28

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size	languages
row_index
0	2024	SE	FT	AI Engineer	202730	USD	202730	US	0	US	M	Python
1	2024	SE	FT	AI Engineer	92118	USD	92118	US	0	US	M	Python
2	2024	SE	FT	Data Engineer	130500	USD	130500	US	0	US	M	Python, SQL, Java, Scala
3	2024	SE	FT	Data Engineer	96000	USD	96000	US	0	US	M	Python, SQL, Java, Scala
4	2024	SE	FT	Machine Learning Engineer	190000	USD	190000	US	0	US	M	Python, Java, C++
5	2024	SE	FT	Machine Learning Engineer	160000	USD	160000	US	0	US	M	Python, Java, C++
6	2024	MI	FT	ML Engineer	400000	USD	400000	US	0	US	M	Python, C++
7	2024	MI	FT	ML Engineer	65000	USD	65000	US	0	US	M	Python, C++
8	2024	EN	FT	Data Analyst	101520	USD	101520	US	0	US	M	SQL, Python, R
9	2024	EN	FT	Data Analyst	45864	USD	45864	US	0	US	M	SQL, Python, R

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size	languages
row_index
0	2024	SE	FT	AI Engineer	202730	USD	202730	US	0	US	M	[Python]
1	2024	SE	FT	AI Engineer	92118	USD	92118	US	0	US	M	[Python]
2	2024	SE	FT	Data Engineer	130500	USD	130500	US	0	US	M	[Python, SQL, Java, Scala]
3	2024	SE	FT	Data Engineer	96000	USD	96000	US	0	US	M	[Python, SQL, Java, Scala]
4	2024	SE	FT	Machine Learning Engineer	190000	USD	190000	US	0	US	M	[Python, Java, C++]
...	...	...	...	...	...	...	...	...	...	...	...	...
16529	2020	SE	FT	Data Scientist	412000	USD	412000	US	100	US	L	[Python, R, SQL]
16530	2021	MI	FT	Principal Data Scientist	151000	USD	151000	US	100	US	L	[Python, R]
16531	2020	EN	FT	Data Scientist	105000	USD	105000	US	100	US	S	[Python, R, SQL]
16532	2020	EN	CT	Business Data Analyst	100000	USD	100000	US	100	US	L	[SQL, Python]
16533	2021	SE	FT	Data Science Manager	7000000	INR	94665	IN	50	IN	L	[Python, R]

Ivadas¶

Tyrimo tikslas¶

Hipotezės¶

Importojam modulius¶

Įrašau duomenis iš CSV failų į mySQL serverį¶

Pasižiūriu kaip atrodo duomenys, juos sutvarkau¶

Hipotezių tikrinimas¶

Pirmos hipotezės tikrinimas¶

Antros hipotezės tikrinimas¶

Trečios hipotezės tikrinimas¶

Ketvirtos hipotezės tikrinimas¶

Penktos hipotezės tikrinimas¶

Išvados¶