normal distrib + min analyze

This commit is contained in:
KP9lK 2024-10-23 22:12:41 +03:00
parent 0c6ad3d920
commit ad1ee91a78
3 changed files with 1280 additions and 13 deletions

39
analyze.py Normal file
View File

@ -0,0 +1,39 @@
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
import pandas as pd
# Exploratory analysis of the cleaned dataset in "cleared.csv":
#   1. Shapiro-Wilk normality test on the two salary columns,
#   2. distribution plots (KDE and box plot) of the salaries,
#   3. correlation heatmap over numeric features plus one-hot encoded areas,
#   4. mean salaries per area.
data = pd.read_csv("cleared.csv")

# Columns fed into the correlation heatmap. The "area_*" entries are the
# one-hot columns that pd.get_dummies() derives below from the lower-cased
# area names, so they must match those names exactly.
columns_for_cor = [
    "experience",
    "employment",
    "salary_min",
    "salary_max",
    "area_краснодар",
    "area_москва",
    "area_санкт-петербург",
]

# Normalise area names to lower case so that differently-cased spellings of
# the same area fall into one group / one dummy column.
data["area"] = data["area"].map(lambda area: str(area).lower()).astype("string")

# Shapiro-Wilk normality test. Missing values are dropped first because a NaN
# in the sample would propagate to a NaN result; the p-values are reported so
# the outcome of the test is actually visible.
_, p_min = shapiro(data["salary_min"].dropna())
_, p_max = shapiro(data["salary_max"].dropna())
print(f"Shapiro-Wilk p-value: salary_min={p_min:.4g}, salary_max={p_max:.4g}")

data.info()

# Each plot gets its own figure; without plt.figure() all three seaborn calls
# draw onto the same current axes and overlay one another.
salary_columns = ["salary_max", "salary_min"]
plt.figure()
sns.kdeplot(data=data[salary_columns])
plt.figure()
sns.boxplot(data=data[salary_columns])

# Keep only areas with more than 30 rows before encoding and aggregating.
data = data.groupby("area").filter(lambda group: len(group) > 30)
data_dum = pd.get_dummies(data, columns=["area"])
print(data_dum.columns)
data_dum.info()

plt.figure()
sns.heatmap(data_dum[columns_for_cor].corr())
plt.show()

print(data[["salary_min", "salary_max", "area"]].groupby("area").mean())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
data.info()

1238
cleared.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,14 @@
import pandas as pd import pandas as pd
import ast import ast
import numpy as np import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv("raw_dataset.csv") data = pd.read_csv("raw_dataset.csv")
columns = [ columns = [
"experience", "experience",
"employment", "employment",
"salary_min", "salary_min",
"salary_max", "salary_max",
"area"
] ]
columns_for_cor = [ columns_for_cor = [
@ -56,15 +56,5 @@ data["salary_max"] = data["salary"].map(
data["experience"] = data["experience"].map(lambda experience: schedule[experience]) data["experience"] = data["experience"].map(lambda experience: schedule[experience])
data["employment"] = data["employment"].map(lambda employment: employements[employment]) data["employment"] = data["employment"].map(lambda employment: employements[employment])
data = data.groupby("area").filter(lambda values: len(values) > 30) data[columns].to_csv("cleared.csv")
data = pd.get_dummies(data, columns=["area"])
data_cor = data[columns_for_cor].corr()
sns.heatmap(data_cor)
plt.show()
print(data_cor)