1231/module V.ipynb
2025-02-23 21:54:44 +03:00

1109 lines
41 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d792e3b7-4590-4545-a343-2c9ee606f50e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: natasha in c:\\programdata\\anaconda3\\lib\\site-packages (1.6.0)\n",
"Requirement already satisfied: pymorphy2 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.1)\n",
"Requirement already satisfied: razdel>=0.5.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.5.0)\n",
"Requirement already satisfied: navec>=0.9.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.10.0)\n",
"Requirement already satisfied: slovnet>=0.6.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.6.0)\n",
"Requirement already satisfied: yargy>=0.16.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.16.0)\n",
"Requirement already satisfied: ipymarkup>=0.8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.0)\n",
"Requirement already satisfied: intervaltree>=3 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipymarkup>=0.8.0->natasha) (3.1.0)\n",
"Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from navec>=0.9.0->natasha) (1.26.4)\n",
"Requirement already satisfied: dawg-python>=0.7.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.7.2)\n",
"Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (2.4.417127.4579844)\n",
"Requirement already satisfied: docopt>=0.6 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.6.2)\n",
"Requirement already satisfied: sortedcontainers<3.0,>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from intervaltree>=3->ipymarkup>=0.8.0->natasha) (2.4.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Install natasha, which the preprocessing cell below uses for segmentation and lemmatisation.\n",
"# %pip targets the environment of the running kernel; the pin matches the version recorded in this cell's saved output.\n",
"%pip install natasha==1.6.0"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c6a64528-8064-41aa-83f1-591096e03080",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: bs4 in c:\\programdata\\anaconda3\\lib\\site-packages (0.0.2)\n",
"Requirement already satisfied: beautifulsoup4 in c:\\programdata\\anaconda3\\lib\\site-packages (from bs4) (4.12.3)\n",
"Requirement already satisfied: soupsieve>1.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from beautifulsoup4->bs4) (2.5)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Install Beautiful Soup 4 (imported below as bs4) to strip markup from the reviews.\n",
"# The PyPI package named bs4 only pulls in beautifulsoup4, so the real distribution is installed directly;\n",
"# the pin matches the version recorded in this cell's saved output.\n",
"%pip install beautifulsoup4==4.12.3"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "407b9639-427f-4e2d-98a0-bafc0bba45f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recommendationid</th>\n",
" <th>language</th>\n",
" <th>review</th>\n",
" <th>timestamp_created</th>\n",
" <th>timestamp_updated</th>\n",
" <th>voted_up</th>\n",
" <th>votes_up</th>\n",
" <th>votes_funny</th>\n",
" <th>weighted_vote_score</th>\n",
" <th>written_during_early_access</th>\n",
" <th>comment_count</th>\n",
" <th>steam_purchase</th>\n",
" <th>received_for_free</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>149400957</td>\n",
" <td>english</td>\n",
" <td>Apparently, my favorite colour is red. Astario...</td>\n",
" <td>1698960336</td>\n",
" <td>1698960336</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>150084129</td>\n",
" <td>english</td>\n",
" <td>I literally got a new 1TB SSD to play this aft...</td>\n",
" <td>1699913072</td>\n",
" <td>1699913072</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>148459881</td>\n",
" <td>english</td>\n",
" <td>An incredible game, Bg3 has incredible graphic...</td>\n",
" <td>1697668891</td>\n",
" <td>1697668891</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>79014545</td>\n",
" <td>english</td>\n",
" <td>good</td>\n",
" <td>1604953762</td>\n",
" <td>1691397976</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>144059171</td>\n",
" <td>english</td>\n",
" <td>I havent been this invested into a game since ...</td>\n",
" <td>1691902245</td>\n",
" <td>1691902245</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1995</th>\n",
" <td>77854487</td>\n",
" <td>english</td>\n",
" <td>The graphics, game play, and voice acting are ...</td>\n",
" <td>1603168676</td>\n",
" <td>1634929644</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996</th>\n",
" <td>148174826</td>\n",
" <td>english</td>\n",
" <td>I,m very new to a game like this but i already...</td>\n",
" <td>1697253758</td>\n",
" <td>1697253758</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1997</th>\n",
" <td>149257034</td>\n",
" <td>english</td>\n",
" <td>A real work of art, tremendous amounts of deta...</td>\n",
" <td>1698765718</td>\n",
" <td>1698765718</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1998</th>\n",
" <td>121727958</td>\n",
" <td>english</td>\n",
" <td>Game is still in flux, but I trust the develop...</td>\n",
" <td>1662289547</td>\n",
" <td>1662289547</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1999</th>\n",
" <td>144358746</td>\n",
" <td>english</td>\n",
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁...</td>\n",
" <td>1692216013</td>\n",
" <td>1692216013</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.52381</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" recommendationid language \\\n",
"0 149400957 english \n",
"1 150084129 english \n",
"2 148459881 english \n",
"3 79014545 english \n",
"4 144059171 english \n",
"... ... ... \n",
"1995 77854487 english \n",
"1996 148174826 english \n",
"1997 149257034 english \n",
"1998 121727958 english \n",
"1999 144358746 english \n",
"\n",
" review timestamp_created \\\n",
"0 Apparently, my favorite colour is red. Astario... 1698960336 \n",
"1 I literally got a new 1TB SSD to play this aft... 1699913072 \n",
"2 An incredible game, Bg3 has incredible graphic... 1697668891 \n",
"3 good 1604953762 \n",
"4 I havent been this invested into a game since ... 1691902245 \n",
"... ... ... \n",
"1995 The graphics, game play, and voice acting are ... 1603168676 \n",
"1996 I,m very new to a game like this but i already... 1697253758 \n",
"1997 A real work of art, tremendous amounts of deta... 1698765718 \n",
"1998 Game is still in flux, but I trust the develop... 1662289547 \n",
"1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁... 1692216013 \n",
"\n",
" timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
"0 1698960336 True 1 0 0.00000 \n",
"1 1699913072 True 1 0 0.00000 \n",
"2 1697668891 True 0 0 0.00000 \n",
"3 1691397976 True 0 0 0.00000 \n",
"4 1691902245 True 0 0 0.00000 \n",
"... ... ... ... ... ... \n",
"1995 1634929644 True 0 0 0.00000 \n",
"1996 1697253758 True 0 0 0.00000 \n",
"1997 1698765718 True 0 0 0.00000 \n",
"1998 1662289547 True 0 0 0.00000 \n",
"1999 1692216013 True 1 0 0.52381 \n",
"\n",
" written_during_early_access comment_count steam_purchase \\\n",
"0 False 0 True \n",
"1 False 0 True \n",
"2 False 0 False \n",
"3 True 0 True \n",
"4 False 0 True \n",
"... ... ... ... \n",
"1995 True 0 True \n",
"1996 False 0 True \n",
"1997 False 0 True \n",
"1998 True 0 True \n",
"1999 False 0 True \n",
"\n",
" received_for_free \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"... ... \n",
"1995 False \n",
"1996 False \n",
"1997 False \n",
"1998 False \n",
"1999 False \n",
"\n",
"[2000 rows x 13 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from natasha import (\n",
" Segmenter,\n",
" MorphVocab,\n",
" NewsEmbedding,\n",
" NewsMorphTagger,\n",
" Doc\n",
")\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from string import punctuation\n",
"from nltk.tokenize import word_tokenize\n",
"# Commented out because these downloads only need to be run once\n",
"# nltk.download('punkt')\n",
"# nltk.download('stopwords')\n",
"# nltk.download('punkt_tab')\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import confusion_matrix\n",
"from bs4 import BeautifulSoup\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Load the review sample from a CSV file (relative path, resolved against the kernel's working directory)\n",
"df = pd.read_csv('sample-module-v.csv')\n",
"\n",
"# Show the loaded frame as the cell output\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "72e59a17-88fe-47f5-bf1c-7ab49dbb1761",
"metadata": {},
"outputs": [],
"source": [
"# Replace missing (NaN) reviews with a placeholder string made only of hyphens.\n",
"# preprocess_text (defined further down) turns every non-word character into a space, so this placeholder ends up as an empty string.\n",
"df['review'] = df['review'].fillna('-------------------------------------------------------------------------------------------------')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cf36326a-5b93-4d29-b420-711e0f323aa6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>language</th>\n",
" <th>review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>english</td>\n",
" <td>Apparently, my favorite colour is red. Astario...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>english</td>\n",
" <td>I literally got a new 1TB SSD to play this aft...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>english</td>\n",
" <td>An incredible game, Bg3 has incredible graphic...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>english</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>english</td>\n",
" <td>I havent been this invested into a game since ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1995</th>\n",
" <td>english</td>\n",
" <td>The graphics, game play, and voice acting are ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996</th>\n",
" <td>english</td>\n",
" <td>I,m very new to a game like this but i already...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1997</th>\n",
" <td>english</td>\n",
" <td>A real work of art, tremendous amounts of deta...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1998</th>\n",
" <td>english</td>\n",
" <td>Game is still in flux, but I trust the develop...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1999</th>\n",
" <td>english</td>\n",
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" language review\n",
"0 english Apparently, my favorite colour is red. Astario...\n",
"1 english I literally got a new 1TB SSD to play this aft...\n",
"2 english An incredible game, Bg3 has incredible graphic...\n",
"3 english good\n",
"4 english I havent been this invested into a game since ...\n",
"... ... ...\n",
"1995 english The graphics, game play, and voice acting are ...\n",
"1996 english I,m very new to a game like this but i already...\n",
"1997 english A real work of art, tremendous amounts of deta...\n",
"1998 english Game is still in flux, but I trust the develop...\n",
"1999 english ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁...\n",
"\n",
"[2000 rows x 2 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the object-dtype (text) columns in their own frame and show it\n",
"object_cols = df.select_dtypes(include='object')\n",
"object_cols"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d3044cb9-649c-407c-b1ff-35a8ef4688ba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2000 entries, 0 to 1999\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 recommendationid 2000 non-null int64 \n",
" 1 language 2000 non-null object \n",
" 2 review 2000 non-null object \n",
" 3 timestamp_created 2000 non-null int64 \n",
" 4 timestamp_updated 2000 non-null int64 \n",
" 5 voted_up 2000 non-null bool \n",
" 6 votes_up 2000 non-null int64 \n",
" 7 votes_funny 2000 non-null int64 \n",
" 8 weighted_vote_score 2000 non-null float64\n",
" 9 written_during_early_access 2000 non-null bool \n",
" 10 comment_count 2000 non-null int64 \n",
" 11 steam_purchase 2000 non-null bool \n",
" 12 received_for_free 2000 non-null bool \n",
"dtypes: bool(4), float64(1), int64(6), object(2)\n",
"memory usage: 148.6+ KB\n"
]
}
],
"source": [
"# Print the column dtypes, non-null counts and memory usage of the frame\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "547cc03f-f889-4d63-838c-b84ee566a4e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"weighted_vote_score\n",
"0.000000 1539\n",
"0.523810 155\n",
"0.476190 26\n",
"0.545455 25\n",
"0.521739 18\n",
" ... \n",
"0.503801 1\n",
"0.510978 1\n",
"0.474167 1\n",
"0.501501 1\n",
"0.437956 1\n",
"Name: count, Length: 201, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how often each distinct weighted_vote_score value occurs\n",
"df['weighted_vote_score'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "05491808-33db-4c50-a957-1e9abb3fe1c0",
"metadata": {},
"outputs": [],
"source": [
"def strip_html_tags(html):\n",
"    \"\"\"Return the text content of ``html`` with all markup removed.\n",
"\n",
"    Missing values (anything ``pd.isna`` reports as NA) are passed through\n",
"    untouched. Otherwise the input is parsed with BeautifulSoup's\n",
"    ``html.parser`` and the text pieces are stripped and joined with single spaces.\n",
"    \"\"\"\n",
"    if not pd.isna(html):\n",
"        parsed = BeautifulSoup(html, 'html.parser')\n",
"        return parsed.get_text(separator=' ', strip=True)\n",
"    return html\n",
"\n",
"\n",
"df['review'] = df['review'].apply(strip_html_tags)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8ce53318-f17d-4869-938a-f794be97378c",
"metadata": {},
"outputs": [],
"source": [
"# Trim leading and trailing whitespace from each review\n",
"df['review'] = df['review'].str.strip()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b5e1e5fa-6e02-4660-84b9-1aa072f33513",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>recommendationid</th>\n",
" <th>language</th>\n",
" <th>review</th>\n",
" <th>timestamp_created</th>\n",
" <th>timestamp_updated</th>\n",
" <th>voted_up</th>\n",
" <th>votes_up</th>\n",
" <th>votes_funny</th>\n",
" <th>weighted_vote_score</th>\n",
" <th>written_during_early_access</th>\n",
" <th>comment_count</th>\n",
" <th>steam_purchase</th>\n",
" <th>received_for_free</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>149400957</td>\n",
" <td>english</td>\n",
" <td>apparently, my favorite colour is red. astario...</td>\n",
" <td>1698960336</td>\n",
" <td>1698960336</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>150084129</td>\n",
" <td>english</td>\n",
" <td>i literally got a new 1tb ssd to play this aft...</td>\n",
" <td>1699913072</td>\n",
" <td>1699913072</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>148459881</td>\n",
" <td>english</td>\n",
" <td>an incredible game, bg3 has incredible graphic...</td>\n",
" <td>1697668891</td>\n",
" <td>1697668891</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>79014545</td>\n",
" <td>english</td>\n",
" <td>good</td>\n",
" <td>1604953762</td>\n",
" <td>1691397976</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>144059171</td>\n",
" <td>english</td>\n",
" <td>i havent been this invested into a game since ...</td>\n",
" <td>1691902245</td>\n",
" <td>1691902245</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1995</th>\n",
" <td>77854487</td>\n",
" <td>english</td>\n",
" <td>the graphics, game play, and voice acting are ...</td>\n",
" <td>1603168676</td>\n",
" <td>1634929644</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996</th>\n",
" <td>148174826</td>\n",
" <td>english</td>\n",
" <td>i,m very new to a game like this but i already...</td>\n",
" <td>1697253758</td>\n",
" <td>1697253758</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1997</th>\n",
" <td>149257034</td>\n",
" <td>english</td>\n",
" <td>a real work of art, tremendous amounts of deta...</td>\n",
" <td>1698765718</td>\n",
" <td>1698765718</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1998</th>\n",
" <td>121727958</td>\n",
" <td>english</td>\n",
" <td>game is still in flux, but i trust the develop...</td>\n",
" <td>1662289547</td>\n",
" <td>1662289547</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00000</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1999</th>\n",
" <td>144358746</td>\n",
" <td>english</td>\n",
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁...</td>\n",
" <td>1692216013</td>\n",
" <td>1692216013</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.52381</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" recommendationid language \\\n",
"0 149400957 english \n",
"1 150084129 english \n",
"2 148459881 english \n",
"3 79014545 english \n",
"4 144059171 english \n",
"... ... ... \n",
"1995 77854487 english \n",
"1996 148174826 english \n",
"1997 149257034 english \n",
"1998 121727958 english \n",
"1999 144358746 english \n",
"\n",
" review timestamp_created \\\n",
"0 apparently, my favorite colour is red. astario... 1698960336 \n",
"1 i literally got a new 1tb ssd to play this aft... 1699913072 \n",
"2 an incredible game, bg3 has incredible graphic... 1697668891 \n",
"3 good 1604953762 \n",
"4 i havent been this invested into a game since ... 1691902245 \n",
"... ... ... \n",
"1995 the graphics, game play, and voice acting are ... 1603168676 \n",
"1996 i,m very new to a game like this but i already... 1697253758 \n",
"1997 a real work of art, tremendous amounts of deta... 1698765718 \n",
"1998 game is still in flux, but i trust the develop... 1662289547 \n",
"1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁... 1692216013 \n",
"\n",
" timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
"0 1698960336 True 1 0 0.00000 \n",
"1 1699913072 True 1 0 0.00000 \n",
"2 1697668891 True 0 0 0.00000 \n",
"3 1691397976 True 0 0 0.00000 \n",
"4 1691902245 True 0 0 0.00000 \n",
"... ... ... ... ... ... \n",
"1995 1634929644 True 0 0 0.00000 \n",
"1996 1697253758 True 0 0 0.00000 \n",
"1997 1698765718 True 0 0 0.00000 \n",
"1998 1662289547 True 0 0 0.00000 \n",
"1999 1692216013 True 1 0 0.52381 \n",
"\n",
" written_during_early_access comment_count steam_purchase \\\n",
"0 False 0 True \n",
"1 False 0 True \n",
"2 False 0 False \n",
"3 True 0 True \n",
"4 False 0 True \n",
"... ... ... ... \n",
"1995 True 0 True \n",
"1996 False 0 True \n",
"1997 False 0 True \n",
"1998 True 0 True \n",
"1999 False 0 True \n",
"\n",
" received_for_free \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"... ... \n",
"1995 False \n",
"1996 False \n",
"1997 False \n",
"1998 False \n",
"1999 False \n",
"\n",
"[2000 rows x 13 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lower-case every column of object_cols (the object-dtype columns selected earlier), then show the frame\n",
"for text_column in object_cols.columns:\n",
"    df[text_column] = df[text_column].str.lower()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "0323f4c4-7985-4f4d-8306-e77b70f91a4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пример обработанных текстов:\n",
"0 apparently favorite colour red astarion red fl...\n",
"1 literally got new ssd play update worth idk st...\n",
"2 incredible game incredible graphics lovable co...\n",
"Name: processed_text, dtype: object\n",
"\n"
]
}
],
"source": [
"# Natasha initialisation: segmenter, morphological vocabulary, embedding and the morph tagger built on that embedding\n",
"# NOTE(review): Natasha's models are built for Russian, while the rows shown above are English reviews — confirm that lemmatisation has any effect here.\n",
"segmenter = Segmenter()\n",
"morph_vocab = MorphVocab()\n",
"emb = NewsEmbedding()\n",
"morph_tagger = NewsMorphTagger(emb)\n",
"\n",
"# Text preprocessing\n",
"def preprocess_text(text):\n",
"    \"\"\"Normalise ``text`` to lower-case words separated by single spaces.\n",
"\n",
"    Every character that is not a word character or whitespace, and every digit\n",
"    or underscore, is replaced by a space; whitespace runs (including line\n",
"    breaks) are then collapsed and the ends trimmed.\n",
"    \"\"\"\n",
"    letters_only = re.sub(r'[^\\w\\s]|[\\d_]', ' ', text, flags=re.UNICODE)\n",
"    single_spaced = re.sub(r'\\s+', ' ', letters_only)\n",
"    return single_spaced.strip().lower()\n",
"\n",
"# Built once: stopwords.words() returns a fresh list on every call, and a set gives constant-time lookups.\n",
"_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))\n",
"\n",
"\n",
"def lemmatize_text(text):\n",
"    \"\"\"Lemmatise ``text`` with Natasha and drop uninformative tokens.\n",
"\n",
"    The text is segmented, morphologically tagged and each token is lemmatised\n",
"    with ``morph_vocab``. Returns the surviving lemmas joined by single spaces;\n",
"    a lemma survives when it is non-empty, longer than two characters and not\n",
"    an NLTK English stop word.\n",
"    \"\"\"\n",
"    doc = Doc(text)\n",
"    doc.segment(segmenter)\n",
"    doc.tag_morph(morph_tagger)\n",
"    for token in doc.tokens:\n",
"        token.lemmatize(morph_vocab)\n",
"    # Filter out stop words and short lemmas\n",
"    return \" \".join(\n",
"        token.lemma for token in doc.tokens\n",
"        if token.lemma and len(token.lemma) > 2\n",
"        and token.lemma not in _ENGLISH_STOPWORDS\n",
"    )\n",
"\n",
"# Apply the processing: the cleaned text overwrites the 'review' column, the lemmatised text goes into 'processed_text'\n",
"df['review'] = df['review'].apply(preprocess_text)\n",
"df['processed_text'] = df['review'].apply(lemmatize_text)\n",
"\n",
"# Check the data after processing: print the first three processed texts\n",
"print(\"Пример обработанных текстов:\")\n",
"print(df['processed_text'].head(3), end='\\n\\n')\n",
"\n",
"# Vectorisation: TF-IDF over word 1- to 3-grams, capped at 1000 features\n",
"vectorizer = TfidfVectorizer(\n",
" max_features=1000,\n",
" ngram_range=(1, 3), \n",
" stop_words=stopwords.words('english'), # Add stop words\n",
" min_df=5, # Ignore rare terms\n",
" max_df=0.7 # Ignore overly frequent terms\n",
")\n",
"# NOTE(review): this fit uses all rows, i.e. it happens before the train/test split in the next cell;\n",
"# features taken from tfidf_matrix have therefore already seen the test rows.\n",
"tfidf_matrix = vectorizer.fit_transform(df[\"processed_text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f45b55d5-663b-4b14-89e5-8cd1ba359252",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): 'votes_up' is an integer vote count, while 'voted_up' is the boolean column — confirm which one is the intended label.\n",
"try:\n",
"    labels = df['votes_up']\n",
"except KeyError as e:\n",
"    # Re-raise instead of print + exit(): exit() ends the kernel session and hides the traceback.\n",
"    raise KeyError(f\"Ошибка: {e} - проверьте наличие столбца с метками в данных!\") from e\n",
"\n",
"# Split the lemmatised texts first, then fit TF-IDF on the training part only,\n",
"# so that the vocabulary and IDF weights are not influenced by the held-out reviews.\n",
"# This refits the existing `vectorizer`, so its feature names match X_train / X_test.\n",
"text_train, text_test, y_train, y_test = train_test_split(\n",
"    df['processed_text'],\n",
"    labels,\n",
"    test_size=0.25,\n",
"    random_state=42\n",
")\n",
"X_train = vectorizer.fit_transform(text_train)\n",
"X_test = vectorizer.transform(text_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c905d1b-44b2-4f27-8cf4-c21a148ffe7d",
"metadata": {},
"outputs": [],
"source": [
"# 7. Model tuning with GridSearch\n",
"# Every C / penalty / solver combination below is tried with 5-fold cross-validation.\n",
"param_grid = {\n",
"    'C': [0.1, 1, 10],\n",
"    'penalty': ['l1', 'l2'],\n",
"    'solver': ['liblinear', 'saga']\n",
"}\n",
"\n",
"# random_state fixes the data shuffling done by the liblinear and saga solvers, so repeated runs give the same fit.\n",
"logreg = LogisticRegression(max_iter=1000, random_state=42)\n",
"grid_search = GridSearchCV(logreg, param_grid, cv=5, n_jobs=-1)\n",
"grid_search.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bce774f-94a3-4f01-9213-3d208c8c9300",
"metadata": {},
"outputs": [],
"source": [
"# 8. Model evaluation\n",
"# Take the best estimator found by the grid search and predict the held-out set\n",
"best_model = grid_search.best_estimator_\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"# Report the winning parameter combination and the per-class metrics\n",
"print(\"\\nЛучшие параметры модели:\", grid_search.best_params_)\n",
"print(\"\\nClassification Report:\")\n",
"print(classification_report(y_test, y_pred))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cbcc13f-b585-4a34-abab-dfc320683281",
"metadata": {},
"outputs": [],
"source": [
"# 9. Confusion-matrix visualisation\n",
"# One explicit label list drives both the matrix and the tick labels, so rows/columns and their captions always agree.\n",
"# best_model.classes_ holds the labels seen during training, which need not match the labels present in y_test / y_pred.\n",
"matrix_labels = sorted(set(y_test) | set(y_pred))\n",
"sns.heatmap(\n",
"    confusion_matrix(y_test, y_pred, labels=matrix_labels),\n",
"    annot=True,\n",
"    fmt='d',\n",
"    cmap='Blues',\n",
"    xticklabels=matrix_labels,\n",
"    yticklabels=matrix_labels\n",
")\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('True')\n",
"plt.title('Confusion Matrix')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b3f9fa6-d044-410e-94e9-57a468e6f3eb",
"metadata": {},
"outputs": [],
"source": [
"# 10. Feature-importance analysis\n",
"# coef_[0] is the first row of the model's coefficient matrix.\n",
"# NOTE(review): with more than two classes that row describes only the first class — confirm this is intended.\n",
"coefficients = best_model.coef_[0]\n",
"feature_names = vectorizer.get_feature_names_out()\n",
"ascending_order = np.argsort(coefficients)\n",
"# Indices of the 20 largest coefficients, smallest of them first\n",
"top_features = ascending_order[-20:]\n",
"\n",
"print(\"\\nТоп-20 значимых признаков:\")\n",
"for idx in top_features:\n",
"    print(f\"{feature_names[idx]}: {coefficients[idx]:.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4eb3a05-e84f-4576-b5ed-0cec03f1f787",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}