ml_module1/clearing_data.py
2024-10-22 21:28:36 +03:00

71 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw dataset from the current working directory. The steps below
# read its "area", "experience", "employment" and "salary" columns.
data = pd.read_csv(filepath_or_buffer="raw_dataset.csv")
# Base feature columns of the prepared dataset.
columns = ["experience", "employment", "salary_min", "salary_max"]

# Columns that go into the correlation matrix: the base features plus the
# "area_*" indicator columns that pd.get_dummies creates for the area column.
columns_for_cor = [
    *columns,
    "area_Краснодар",
    "area_Москва",
    "area_Санкт-Петербург",
]

# Integer codes for the experience labels, numbered in listing order.
# NOTE: despite its name, this mapping is applied to the "experience" column.
schedule = {
    label: code
    for code, label in enumerate(
        (
            "Нет опыта",
            "От 1 года до 3 лет",
            "От 3 до 6 лет",
            "Более 6 лет",
        )
    )
}

# Integer codes for the employment-type labels, numbered in listing order.
employements = {
    label: code
    for code, label in enumerate(
        (
            "Полная занятость",
            "Частичная занятость",
            "Проектная работа",
            "Волонтерство",
            "Стажировка",
        )
    )
}
# The raw columns hold Python dict literals stored as strings.
# ast.literal_eval parses them without executing arbitrary code.


def _extract_name(raw):
    """Parse a stringified dict literal and return its ``"name"`` entry."""
    return ast.literal_eval(raw)["name"]


def _salary_lower(bounds):
    """Return ``bounds["from"]``, falling back to ``bounds["to"]`` when it is None."""
    lower = bounds["from"]
    return lower if lower is not None else bounds["to"]


def _salary_upper(bounds):
    """Return ``bounds["to"]``, falling back to ``bounds["from"]`` when it is None."""
    upper = bounds["to"]
    return upper if upper is not None else bounds["from"]


# Replace each dict-valued column with the plain "name" string it carries.
for _column in ("area", "experience", "employment"):
    data[_column] = data[_column].map(_extract_name)

# Parse every salary cell exactly once, then derive both bounds from the
# parsed dicts. When one bound is None the other is used for both columns;
# when both are None the result is None.
_parsed_salary = data["salary"].map(ast.literal_eval)
data["salary_min"] = _parsed_salary.map(_salary_lower)
data["salary_max"] = _parsed_salary.map(_salary_upper)
# Swap the textual labels for their integer codes. The lookup is a plain
# dict access, so a label missing from the mapping raises KeyError.
data["experience"] = data["experience"].map(schedule.__getitem__)
data["employment"] = data["employment"].map(employements.__getitem__)


def _has_enough_rows(group):
    """Return True when an area group contains more than 30 rows."""
    return len(group) > 30


# Keep only rows whose area occurs more than 30 times, then expand the area
# column into one indicator column per remaining area ("area_<name>").
data = data.groupby("area").filter(_has_enough_rows)
data = pd.get_dummies(data, columns=["area"])

# Pairwise correlation of the selected columns. Selecting them raises
# KeyError if an expected "area_*" column is absent after the filter above.
_selected = data[columns_for_cor]
data_cor = _selected.corr()

# Show the matrix as a heatmap, then print the numeric values.
sns.heatmap(data_cor)
plt.show()
print(data_cor)