"""Exploratory analysis of salary data read from ``cleared.csv``.

Steps performed, in order:

1. Normalise the ``area`` column to lower-case strings.
2. Run the Shapiro-Wilk normality test on ``salary_min`` / ``salary_max``
   and report the p-values.
3. Plot the salary distributions (KDE and box plot, each on its own figure).
4. Keep only areas with more than 30 rows, one-hot encode ``area`` and
   draw a correlation heat map for a fixed list of columns.
5. Print the mean salaries per area.
"""

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro

data = pd.read_csv("cleared.csv")

# Columns fed into the correlation matrix. The three ``area_*`` entries are
# produced by ``pd.get_dummies`` below, so they exist only when those areas
# are still present after the ">30 rows" filter.
columns_for_cor = [
    "experience",
    "employment",
    "salary_min",
    "salary_max",
    "area_краснодар",
    "area_москва",
    "area_санкт-петербург",
]

# Lower-case every area name (missing values become the literal "nan",
# because they go through ``str``) and store the column with string dtype.
data["area"] = data["area"].map(lambda area: str(area).lower())
data["area"] = data["area"].astype("string")

# Shapiro-Wilk normality test; the p-values are reported so the result of
# the test is actually visible (previously they were computed and dropped).
_, p_min = shapiro(data["salary_min"])
_, p_max = shapiro(data["salary_max"])
print(f"Shapiro-Wilk p-value, salary_min: {p_min}")
print(f"Shapiro-Wilk p-value, salary_max: {p_max}")

data.info()

salary_columns = ["salary_max", "salary_min"]

# Each seaborn call draws on the *current* Axes, so open a fresh figure per
# plot; otherwise the KDE, box plot and heat map end up on top of each other.
plt.figure()
sns.kdeplot(data[salary_columns])

plt.figure()
sns.boxplot(data[salary_columns])

# Drop areas that have 30 rows or fewer.
data = data.groupby("area").filter(lambda group: len(group) > 30)

data_dum = pd.get_dummies(data, columns=["area"])
print(data_dum.columns)
data_dum.info()

plt.figure()
sns.heatmap(data_dum[columns_for_cor].corr())
plt.show()

print(data[["salary_min", "salary_max", "area"]].groupby("area").mean())

# ``DataFrame.info()`` prints its report itself and returns None, so it must
# not be wrapped in ``print`` (that would emit an extra "None" line).
data.info()