This commit is contained in:
KP9lK 2024-10-22 21:28:36 +03:00
commit 0c6ad3d920
9 changed files with 1391 additions and 0 deletions

3
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (mlmodule1)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (mlmodule1)" project-jdk-type="Python SDK" />
</project>

10
.idea/mlmodule1.iml Normal file
View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/mlmodule1.iml" filepath="$PROJECT_DIR$/.idea/mlmodule1.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

70
clearing_data.py Normal file
View File

@ -0,0 +1,70 @@
import pandas as pd
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Columns that describe a vacancy independently of its city.
columns = [
    "experience",
    "employment",
    "salary_min",
    "salary_max",
]
# Columns requested for the correlation matrix; the "area_*" entries are the
# one-hot columns that pd.get_dummies creates from the "area" column.
columns_for_cor = [
    "experience",
    "employment",
    "salary_min",
    "salary_max",
    "area_Краснодар",
    "area_Москва",
    "area_Санкт-Петербург",
]
# Ordinal codes for the experience labels (despite the name, this mapping is
# applied to the "experience" column; the name is kept for compatibility).
schedule = {
    "Нет опыта": 0,
    "От 1 года до 3 лет": 1,
    "От 3 до 6 лет": 2,
    "Более 6 лет": 3,
}
# Integer codes for the employment labels (spelling kept for compatibility).
employements = {
    "Полная занятость": 0,
    "Частичная занятость": 1,
    "Проектная работа": 2,
    "Волонтерство": 3,
    "Стажировка": 4,
}
# An area is kept only when it has more than this many vacancies.
MIN_AREA_SIZE = 30


def _name_of(raw: str) -> str:
    """Return the "name" entry of a dict stored as its Python-literal text."""
    return ast.literal_eval(raw)["name"]


def _salary_min(salary: dict):
    """Return the lower salary bound, falling back to the upper one."""
    lower = salary["from"]
    return lower if lower is not None else salary["to"]


def _salary_max(salary: dict):
    """Return the upper salary bound, falling back to the lower one."""
    upper = salary["to"]
    return upper if upper is not None else salary["from"]


def clean_vacancies(
    raw: pd.DataFrame, min_area_size: int = MIN_AREA_SIZE
) -> pd.DataFrame:
    """Turn the raw vacancy table into a numeric, one-hot encoded frame.

    The "area", "experience", "employment" and "salary" cells are expected to
    hold dict literals as text; they are decoded with ast.literal_eval.

    Args:
        raw: Frame with "area", "experience", "employment" and "salary"
            columns. It is not modified.
        min_area_size: Areas with this many rows or fewer are dropped.

    Returns:
        A new frame with "experience" and "employment" replaced by their
        integer codes, added "salary_min" / "salary_max" columns, and "area"
        expanded into one "area_<name>" column per remaining area.

    Raises:
        KeyError: If an experience or employment label has no code in
            ``schedule`` / ``employements``.
    """
    data = raw.copy()
    data["area"] = data["area"].map(_name_of)
    data["experience"] = data["experience"].map(
        lambda cell: schedule[_name_of(cell)]
    )
    data["employment"] = data["employment"].map(
        lambda cell: employements[_name_of(cell)]
    )

    # Decode every salary cell once and reuse the dict for both bounds.
    salaries = data["salary"].map(ast.literal_eval)
    data["salary_min"] = salaries.map(_salary_min)
    data["salary_max"] = salaries.map(_salary_max)

    data = data.groupby("area").filter(
        lambda group: len(group) > min_area_size
    )
    return pd.get_dummies(data, columns=["area"])


def main():
    """Clean raw_dataset.csv, then plot and print the correlation matrix."""
    data = clean_vacancies(pd.read_csv("raw_dataset.csv"))
    # A city that did not pass the size filter has no dummy column; selecting
    # it would raise KeyError, so only the columns that exist are correlated.
    available = [name for name in columns_for_cor if name in data.columns]
    data_cor = data[available].corr()
    sns.heatmap(data_cor)
    plt.show()
    print(data_cor)
    return data, data_cor


if __name__ == "__main__":
    data, data_cor = main()

43
extract.py Normal file
View File

@ -0,0 +1,43 @@
import requests
import pandas as pd
# Search filters sent with every request; "page" is overridden per request.
params = {
    "page": 0,
    "per_page": 100,
    "text": "bi",
    "area": "113",
    "currency": "RUR",
    "date_from": "2024-07-22",
    "only_with_salary": True,
}
base_url = "https://api.hh.ru/vacancies"
# Seconds to wait for the API before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def fetch_page(page):
    """Return the decoded JSON body of one page of search results.

    Args:
        page: Zero-based page number to request.

    Raises:
        requests.HTTPError: If the API answers with an error status, so the
            failure is reported here rather than as a missing "items" key.
        requests.Timeout: If no answer arrives within REQUEST_TIMEOUT seconds.
    """
    response = requests.get(
        base_url,
        params={**params, "page": page},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()


# The first response reports how many pages the search has in total.
result = fetch_page(0)
vacancies = list(result["items"])
for page in range(1, result["pages"]):
    vacancies.extend(fetch_page(page)["items"])

data = pd.DataFrame.from_dict(vacancies)
data.info()
# Only these fields are kept in the exported dataset.
columns = [
    "area",
    "salary",
    "schedule",
    "url",
    "experience",
    "employment",
]
data = data[columns]
data.to_csv("raw_dataset.csv")

1238
raw_dataset.csv Normal file

File diff suppressed because it is too large Load Diff