ml_module1/clearing_data.py

71 lines
1.8 KiB
Python
Raw Normal View History

2024-10-22 18:28:36 +00:00
import pandas as pd
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw dataset; the relative path is resolved against the current
# working directory, so the script must be started next to the CSV file.
data = pd.read_csv(filepath_or_buffer="raw_dataset.csv")
# Numeric feature columns: "experience" and "employment" are replaced by
# integer codes further down, and the two salary columns are derived from the
# raw "salary" column.
columns = ["experience", "employment", "salary_min", "salary_max"]

# Columns that go into the correlation matrix: the numeric features above
# plus the one-hot city indicators created by pd.get_dummies on "area".
# NOTE: an "area_<city>" column only exists if that city is still present in
# the data when the dummies are built.
columns_for_cor = [
    *columns,
    "area_Краснодар",
    "area_Москва",
    "area_Санкт-Петербург",
]
# Ordinal code for every experience label: the labels are listed from the
# least to the most experience and numbered 0..3 in that order.
schedule = {
    label: code
    for code, label in enumerate(
        (
            "Нет опыта",
            "От 1 года до 3 лет",
            "От 3 до 6 лет",
            "Более 6 лет",
        )
    )
}
# Integer code for every employment-type label, numbered 0..4 in the order
# listed.  (The variable name keeps its original spelling because the rest of
# the script refers to it.)
employements = {
    label: code
    for code, label in enumerate(
        (
            "Полная занятость",
            "Частичная занятость",
            "Проектная работа",
            "Волонтерство",
            "Стажировка",
        )
    )
}
def _name_of(raw_cell):
    """Parse a cell holding a Python dict literal and return its "name" entry."""
    return ast.literal_eval(raw_cell)["name"]


# These three columns store a dict serialized as text; only the
# human-readable "name" entry is kept.
data["area"] = data["area"].map(_name_of)
data["experience"] = data["experience"].map(_name_of)
data["employment"] = data["employment"].map(_name_of)
def _salary_bound(raw_salary, preferred, fallback):
    """Return one bound of a serialized salary range.

    ``raw_salary`` is the text of a Python dict literal that carries
    ``"from"`` and ``"to"`` entries.  The text is parsed exactly once (the
    previous inline lambdas re-parsed the same string up to three times per
    row).  The ``preferred`` entry is returned unless it is ``None``; in that
    case the ``fallback`` entry is returned, which may itself be ``None``.
    """
    salary = ast.literal_eval(raw_salary)
    bound = salary[preferred]
    return salary[fallback] if bound is None else bound


# Lower bound of the range; falls back to the upper bound when "from" is
# missing.
data["salary_min"] = data["salary"].map(
    lambda raw_salary: _salary_bound(raw_salary, "from", "to")
)
# Upper bound of the range; falls back to the lower bound when "to" is
# missing.
data["salary_max"] = data["salary"].map(
    lambda raw_salary: _salary_bound(raw_salary, "to", "from")
)
# Replace the text labels with their integer codes.  The bound ``__getitem__``
# is passed instead of the dict itself so that a label missing from the table
# still raises KeyError rather than silently becoming NaN.
data["experience"] = data["experience"].map(schedule.__getitem__)
data["employment"] = data["employment"].map(employements.__getitem__)
# Keep only the rows whose city occurs more than 30 times, then one-hot
# encode the city into separate "area_<city>" columns.
data = pd.get_dummies(
    data.groupby("area").filter(lambda city_rows: len(city_rows) > 30),
    columns=["area"],
)
# Pairwise correlation matrix of the selected columns (pandas' default
# method).  Selecting raises KeyError if any listed column is absent, e.g. an
# "area_<city>" column for a city that is no longer in the data.
data_cor = data.loc[:, columns_for_cor].corr()

# Draw the matrix as a heatmap first; the numeric table is printed only after
# the plotting call returns.
sns.heatmap(data=data_cor)
plt.show()
print(data_cor)