Normal distribution + min analysis
This commit is contained in:
parent
0c6ad3d920
commit
ad1ee91a78
39
analyze.py
Normal file
39
analyze.py
Normal file
@ -0,0 +1,39 @@
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.stats import shapiro
|
||||
import pandas as pd
|
||||
|
||||
# Exploratory analysis of the cleaned vacancy dataset: normality check of the
# salary columns, distribution plots, a correlation heatmap and per-area
# salary means.
data = pd.read_csv("cleared.csv")

# Columns fed into the correlation matrix. The "area_*" entries are the
# dummy columns produced by pd.get_dummies(..., columns=["area"]) below;
# selecting them raises KeyError if one of these areas is absent from the data.
columns_for_cor = [
    "experience",
    "employment",
    "salary_min",
    "salary_max",
    "area_краснодар",
    "area_москва",
    "area_санкт-петербург",
]

salary_columns = ["salary_max", "salary_min"]

# Normalise area names to lower case so they match the dummy column names in
# columns_for_cor. Converting to the "string" dtype first keeps missing values
# as <NA> instead of turning them into the literal label "nan".
data["area"] = data["area"].astype("string").str.lower()

# Shapiro-Wilk normality test on the salary bounds. Missing values are dropped
# first because a NaN in the sample makes the test result NaN.
w_min, p_min = shapiro(data["salary_min"].dropna())
w_max, p_max = shapiro(data["salary_max"].dropna())
print(f"Shapiro-Wilk salary_min: W={w_min:.4f}, p={p_min:.4g}")
print(f"Shapiro-Wilk salary_max: W={w_max:.4f}, p={p_max:.4g}")

data.info()

# Draw each plot on its own axes; without explicit axes all seaborn calls
# would paint over one another on the same implicit axes.
_, (ax_kde, ax_box) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(data=data[salary_columns], ax=ax_kde)
sns.boxplot(data=data[salary_columns], ax=ax_box)

# Keep only areas with more than 30 rows.
data = data.groupby("area").filter(lambda group: len(group) > 30)

data_dum = pd.get_dummies(data, columns=["area"])
print(data_dum.columns)
data_dum.info()

_, ax_heat = plt.subplots()
sns.heatmap(data_dum[columns_for_cor].corr(), ax=ax_heat)
plt.show()

print(data[["salary_min", "salary_max", "area"]].groupby("area").mean())

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
data.info()
|
1238
cleared.csv
Normal file
1238
cleared.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,14 @@
|
||||
import pandas as pd
|
||||
import ast
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
data = pd.read_csv("raw_dataset.csv")
|
||||
columns = [
|
||||
"experience",
|
||||
"employment",
|
||||
"salary_min",
|
||||
"salary_max",
|
||||
"area"
|
||||
]
|
||||
|
||||
columns_for_cor = [
|
||||
@ -56,15 +56,5 @@ data["salary_max"] = data["salary"].map(
|
||||
data["experience"] = data["experience"].map(lambda experience: schedule[experience])
|
||||
data["employment"] = data["employment"].map(lambda employment: employements[employment])
|
||||
|
||||
data = data.groupby("area").filter(lambda values: len(values) > 30)
|
||||
data[columns].to_csv("cleared.csv")
|
||||
|
||||
data = pd.get_dummies(data, columns=["area"])
|
||||
|
||||
|
||||
|
||||
|
||||
data_cor = data[columns_for_cor].corr()
|
||||
|
||||
sns.heatmap(data_cor)
|
||||
plt.show()
|
||||
print(data_cor)
|
||||
|
Loading…
Reference in New Issue
Block a user