1109 lines
41 KiB
Plaintext
1109 lines
41 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "d792e3b7-4590-4545-a343-2c9ee606f50e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: natasha in c:\\programdata\\anaconda3\\lib\\site-packages (1.6.0)\n",
|
||
"Requirement already satisfied: pymorphy2 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.1)\n",
|
||
"Requirement already satisfied: razdel>=0.5.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.5.0)\n",
|
||
"Requirement already satisfied: navec>=0.9.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.10.0)\n",
|
||
"Requirement already satisfied: slovnet>=0.6.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.6.0)\n",
|
||
"Requirement already satisfied: yargy>=0.16.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.16.0)\n",
|
||
"Requirement already satisfied: ipymarkup>=0.8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.0)\n",
|
||
"Requirement already satisfied: intervaltree>=3 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipymarkup>=0.8.0->natasha) (3.1.0)\n",
|
||
"Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from navec>=0.9.0->natasha) (1.26.4)\n",
|
||
"Requirement already satisfied: dawg-python>=0.7.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.7.2)\n",
|
||
"Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (2.4.417127.4579844)\n",
|
||
"Requirement already satisfied: docopt>=0.6 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.6.2)\n",
|
||
"Requirement already satisfied: sortedcontainers<3.0,>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from intervaltree>=3->ipymarkup>=0.8.0->natasha) (2.4.0)\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Install the natasha library used later in this notebook.\n",
|
||
"# The explicit %pip magic does not rely on automagic being enabled.\n",
|
||
"%pip install natasha"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "c6a64528-8064-41aa-83f1-591096e03080",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: bs4 in c:\\programdata\\anaconda3\\lib\\site-packages (0.0.2)\n",
|
||
"Requirement already satisfied: beautifulsoup4 in c:\\programdata\\anaconda3\\lib\\site-packages (from bs4) (4.12.3)\n",
|
||
"Requirement already satisfied: soupsieve>1.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from beautifulsoup4->bs4) (2.5)\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Install bs4 (Beautiful Soup 4), used later in this notebook.\n",
|
||
"# The explicit %pip magic does not rely on automagic being enabled.\n",
|
||
"%pip install bs4"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "407b9639-427f-4e2d-98a0-bafc0bba45f6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>recommendationid</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>review</th>\n",
|
||
" <th>timestamp_created</th>\n",
|
||
" <th>timestamp_updated</th>\n",
|
||
" <th>voted_up</th>\n",
|
||
" <th>votes_up</th>\n",
|
||
" <th>votes_funny</th>\n",
|
||
" <th>weighted_vote_score</th>\n",
|
||
" <th>written_during_early_access</th>\n",
|
||
" <th>comment_count</th>\n",
|
||
" <th>steam_purchase</th>\n",
|
||
" <th>received_for_free</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>149400957</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>Apparently, my favorite colour is red. Astario...</td>\n",
|
||
" <td>1698960336</td>\n",
|
||
" <td>1698960336</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>150084129</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I literally got a new 1TB SSD to play this aft...</td>\n",
|
||
" <td>1699913072</td>\n",
|
||
" <td>1699913072</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>148459881</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>An incredible game, Bg3 has incredible graphic...</td>\n",
|
||
" <td>1697668891</td>\n",
|
||
" <td>1697668891</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>79014545</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>good</td>\n",
|
||
" <td>1604953762</td>\n",
|
||
" <td>1691397976</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>144059171</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I havent been this invested into a game since ...</td>\n",
|
||
" <td>1691902245</td>\n",
|
||
" <td>1691902245</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1995</th>\n",
|
||
" <td>77854487</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>The graphics, game play, and voice acting are ...</td>\n",
|
||
" <td>1603168676</td>\n",
|
||
" <td>1634929644</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1996</th>\n",
|
||
" <td>148174826</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I,m very new to a game like this but i already...</td>\n",
|
||
" <td>1697253758</td>\n",
|
||
" <td>1697253758</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1997</th>\n",
|
||
" <td>149257034</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>A real work of art, tremendous amounts of deta...</td>\n",
|
||
" <td>1698765718</td>\n",
|
||
" <td>1698765718</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1998</th>\n",
|
||
" <td>121727958</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>Game is still in flux, but I trust the develop...</td>\n",
|
||
" <td>1662289547</td>\n",
|
||
" <td>1662289547</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1999</th>\n",
|
||
" <td>144358746</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...</td>\n",
|
||
" <td>1692216013</td>\n",
|
||
" <td>1692216013</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.52381</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 13 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" recommendationid language \\\n",
|
||
"0 149400957 english \n",
|
||
"1 150084129 english \n",
|
||
"2 148459881 english \n",
|
||
"3 79014545 english \n",
|
||
"4 144059171 english \n",
|
||
"... ... ... \n",
|
||
"1995 77854487 english \n",
|
||
"1996 148174826 english \n",
|
||
"1997 149257034 english \n",
|
||
"1998 121727958 english \n",
|
||
"1999 144358746 english \n",
|
||
"\n",
|
||
" review timestamp_created \\\n",
|
||
"0 Apparently, my favorite colour is red. Astario... 1698960336 \n",
|
||
"1 I literally got a new 1TB SSD to play this aft... 1699913072 \n",
|
||
"2 An incredible game, Bg3 has incredible graphic... 1697668891 \n",
|
||
"3 good 1604953762 \n",
|
||
"4 I havent been this invested into a game since ... 1691902245 \n",
|
||
"... ... ... \n",
|
||
"1995 The graphics, game play, and voice acting are ... 1603168676 \n",
|
||
"1996 I,m very new to a game like this but i already... 1697253758 \n",
|
||
"1997 A real work of art, tremendous amounts of deta... 1698765718 \n",
|
||
"1998 Game is still in flux, but I trust the develop... 1662289547 \n",
|
||
"1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n",
|
||
"\n",
|
||
" timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
|
||
"0 1698960336 True 1 0 0.00000 \n",
|
||
"1 1699913072 True 1 0 0.00000 \n",
|
||
"2 1697668891 True 0 0 0.00000 \n",
|
||
"3 1691397976 True 0 0 0.00000 \n",
|
||
"4 1691902245 True 0 0 0.00000 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"1995 1634929644 True 0 0 0.00000 \n",
|
||
"1996 1697253758 True 0 0 0.00000 \n",
|
||
"1997 1698765718 True 0 0 0.00000 \n",
|
||
"1998 1662289547 True 0 0 0.00000 \n",
|
||
"1999 1692216013 True 1 0 0.52381 \n",
|
||
"\n",
|
||
" written_during_early_access comment_count steam_purchase \\\n",
|
||
"0 False 0 True \n",
|
||
"1 False 0 True \n",
|
||
"2 False 0 False \n",
|
||
"3 True 0 True \n",
|
||
"4 False 0 True \n",
|
||
"... ... ... ... \n",
|
||
"1995 True 0 True \n",
|
||
"1996 False 0 True \n",
|
||
"1997 False 0 True \n",
|
||
"1998 True 0 True \n",
|
||
"1999 False 0 True \n",
|
||
"\n",
|
||
" received_for_free \n",
|
||
"0 False \n",
|
||
"1 False \n",
|
||
"2 False \n",
|
||
"3 False \n",
|
||
"4 False \n",
|
||
"... ... \n",
|
||
"1995 False \n",
|
||
"1996 False \n",
|
||
"1997 False \n",
|
||
"1998 False \n",
|
||
"1999 False \n",
|
||
"\n",
|
||
"[2000 rows x 13 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import re\n",
|
||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
"from natasha import (\n",
|
||
" Segmenter,\n",
|
||
" MorphVocab,\n",
|
||
" NewsEmbedding,\n",
|
||
" NewsMorphTagger,\n",
|
||
" Doc\n",
|
||
")\n",
|
||
"import nltk\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"from string import punctuation\n",
|
||
"from nltk.tokenize import word_tokenize\n",
|
||
"# Commented out because these downloads only need to be run once\n",
|
||
"# nltk.download('punkt')\n",
|
||
"# nltk.download('stopwords')\n",
|
||
"# nltk.download('punkt_tab')\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.model_selection import GridSearchCV\n",
|
||
"from sklearn.metrics import classification_report\n",
|
||
"from sklearn.metrics import confusion_matrix\n",
|
||
"from bs4 import BeautifulSoup\n",
|
||
"import warnings\n",
|
||
"warnings.filterwarnings('ignore')\n",
|
||
"\n",
|
||
"df = pd.read_csv('sample-module-v.csv')\n",
|
||
"\n",
|
||
"df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "72e59a17-88fe-47f5-bf1c-7ab49dbb1761",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df['review'] = df['review'].fillna('-------------------------------------------------------------------------------------------------')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "cf36326a-5b93-4d29-b420-711e0f323aa6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>review</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>Apparently, my favorite colour is red. Astario...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I literally got a new 1TB SSD to play this aft...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>An incredible game, Bg3 has incredible graphic...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>good</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I havent been this invested into a game since ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1995</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>The graphics, game play, and voice acting are ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1996</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>I,m very new to a game like this but i already...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1997</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>A real work of art, tremendous amounts of deta...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1998</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>Game is still in flux, but I trust the develop...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1999</th>\n",
|
||
" <td>english</td>\n",
|
||
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" language review\n",
|
||
"0 english Apparently, my favorite colour is red. Astario...\n",
|
||
"1 english I literally got a new 1TB SSD to play this aft...\n",
|
||
"2 english An incredible game, Bg3 has incredible graphic...\n",
|
||
"3 english good\n",
|
||
"4 english I havent been this invested into a game since ...\n",
|
||
"... ... ...\n",
|
||
"1995 english The graphics, game play, and voice acting are ...\n",
|
||
"1996 english I,m very new to a game like this but i already...\n",
|
||
"1997 english A real work of art, tremendous amounts of deta...\n",
|
||
"1998 english Game is still in flux, but I trust the develop...\n",
|
||
"1999 english ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...\n",
|
||
"\n",
|
||
"[2000 rows x 2 columns]"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"object_cols = df.select_dtypes('object')\n",
|
||
"object_cols"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "d3044cb9-649c-407c-b1ff-35a8ef4688ba",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 2000 entries, 0 to 1999\n",
|
||
"Data columns (total 13 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 recommendationid 2000 non-null int64 \n",
|
||
" 1 language 2000 non-null object \n",
|
||
" 2 review 2000 non-null object \n",
|
||
" 3 timestamp_created 2000 non-null int64 \n",
|
||
" 4 timestamp_updated 2000 non-null int64 \n",
|
||
" 5 voted_up 2000 non-null bool \n",
|
||
" 6 votes_up 2000 non-null int64 \n",
|
||
" 7 votes_funny 2000 non-null int64 \n",
|
||
" 8 weighted_vote_score 2000 non-null float64\n",
|
||
" 9 written_during_early_access 2000 non-null bool \n",
|
||
" 10 comment_count 2000 non-null int64 \n",
|
||
" 11 steam_purchase 2000 non-null bool \n",
|
||
" 12 received_for_free 2000 non-null bool \n",
|
||
"dtypes: bool(4), float64(1), int64(6), object(2)\n",
|
||
"memory usage: 148.6+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "547cc03f-f889-4d63-838c-b84ee566a4e1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"weighted_vote_score\n",
|
||
"0.000000 1539\n",
|
||
"0.523810 155\n",
|
||
"0.476190 26\n",
|
||
"0.545455 25\n",
|
||
"0.521739 18\n",
|
||
" ... \n",
|
||
"0.503801 1\n",
|
||
"0.510978 1\n",
|
||
"0.474167 1\n",
|
||
"0.501501 1\n",
|
||
"0.437956 1\n",
|
||
"Name: count, Length: 201, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df['weighted_vote_score'].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "05491808-33db-4c50-a957-1e9abb3fe1c0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def strip_html_tags(html):\n",
|
||
"    \"\"\"Return the text content of ``html`` with all markup removed.\n",
|
||
"\n",
|
||
"    Values that ``pd.isna`` reports as missing are returned unchanged;\n",
|
||
"    otherwise the stripped text nodes are joined with single spaces.\n",
|
||
"    \"\"\"\n",
|
||
"    if not pd.isna(html):\n",
|
||
"        parsed = BeautifulSoup(html, 'html.parser')\n",
|
||
"        return parsed.get_text(separator=' ', strip=True)\n",
|
||
"    # Missing review: hand the NaN back untouched\n",
|
||
"    return html\n",
|
||
"\n",
|
||
"df['review'] = df['review'].apply(strip_html_tags)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "8ce53318-f17d-4869-938a-f794be97378c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df['review'] = df['review'].str.strip()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "b5e1e5fa-6e02-4660-84b9-1aa072f33513",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>recommendationid</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>review</th>\n",
|
||
" <th>timestamp_created</th>\n",
|
||
" <th>timestamp_updated</th>\n",
|
||
" <th>voted_up</th>\n",
|
||
" <th>votes_up</th>\n",
|
||
" <th>votes_funny</th>\n",
|
||
" <th>weighted_vote_score</th>\n",
|
||
" <th>written_during_early_access</th>\n",
|
||
" <th>comment_count</th>\n",
|
||
" <th>steam_purchase</th>\n",
|
||
" <th>received_for_free</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>149400957</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>apparently, my favorite colour is red. astario...</td>\n",
|
||
" <td>1698960336</td>\n",
|
||
" <td>1698960336</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>150084129</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>i literally got a new 1tb ssd to play this aft...</td>\n",
|
||
" <td>1699913072</td>\n",
|
||
" <td>1699913072</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>148459881</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>an incredible game, bg3 has incredible graphic...</td>\n",
|
||
" <td>1697668891</td>\n",
|
||
" <td>1697668891</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>79014545</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>good</td>\n",
|
||
" <td>1604953762</td>\n",
|
||
" <td>1691397976</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>144059171</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>i havent been this invested into a game since ...</td>\n",
|
||
" <td>1691902245</td>\n",
|
||
" <td>1691902245</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1995</th>\n",
|
||
" <td>77854487</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>the graphics, game play, and voice acting are ...</td>\n",
|
||
" <td>1603168676</td>\n",
|
||
" <td>1634929644</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1996</th>\n",
|
||
" <td>148174826</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>i,m very new to a game like this but i already...</td>\n",
|
||
" <td>1697253758</td>\n",
|
||
" <td>1697253758</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1997</th>\n",
|
||
" <td>149257034</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>a real work of art, tremendous amounts of deta...</td>\n",
|
||
" <td>1698765718</td>\n",
|
||
" <td>1698765718</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1998</th>\n",
|
||
" <td>121727958</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>game is still in flux, but i trust the develop...</td>\n",
|
||
" <td>1662289547</td>\n",
|
||
" <td>1662289547</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00000</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1999</th>\n",
|
||
" <td>144358746</td>\n",
|
||
" <td>english</td>\n",
|
||
" <td>⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...</td>\n",
|
||
" <td>1692216013</td>\n",
|
||
" <td>1692216013</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.52381</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 13 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" recommendationid language \\\n",
|
||
"0 149400957 english \n",
|
||
"1 150084129 english \n",
|
||
"2 148459881 english \n",
|
||
"3 79014545 english \n",
|
||
"4 144059171 english \n",
|
||
"... ... ... \n",
|
||
"1995 77854487 english \n",
|
||
"1996 148174826 english \n",
|
||
"1997 149257034 english \n",
|
||
"1998 121727958 english \n",
|
||
"1999 144358746 english \n",
|
||
"\n",
|
||
" review timestamp_created \\\n",
|
||
"0 apparently, my favorite colour is red. astario... 1698960336 \n",
|
||
"1 i literally got a new 1tb ssd to play this aft... 1699913072 \n",
|
||
"2 an incredible game, bg3 has incredible graphic... 1697668891 \n",
|
||
"3 good 1604953762 \n",
|
||
"4 i havent been this invested into a game since ... 1691902245 \n",
|
||
"... ... ... \n",
|
||
"1995 the graphics, game play, and voice acting are ... 1603168676 \n",
|
||
"1996 i,m very new to a game like this but i already... 1697253758 \n",
|
||
"1997 a real work of art, tremendous amounts of deta... 1698765718 \n",
|
||
"1998 game is still in flux, but i trust the develop... 1662289547 \n",
|
||
"1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n",
|
||
"\n",
|
||
" timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
|
||
"0 1698960336 True 1 0 0.00000 \n",
|
||
"1 1699913072 True 1 0 0.00000 \n",
|
||
"2 1697668891 True 0 0 0.00000 \n",
|
||
"3 1691397976 True 0 0 0.00000 \n",
|
||
"4 1691902245 True 0 0 0.00000 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"1995 1634929644 True 0 0 0.00000 \n",
|
||
"1996 1697253758 True 0 0 0.00000 \n",
|
||
"1997 1698765718 True 0 0 0.00000 \n",
|
||
"1998 1662289547 True 0 0 0.00000 \n",
|
||
"1999 1692216013 True 1 0 0.52381 \n",
|
||
"\n",
|
||
" written_during_early_access comment_count steam_purchase \\\n",
|
||
"0 False 0 True \n",
|
||
"1 False 0 True \n",
|
||
"2 False 0 False \n",
|
||
"3 True 0 True \n",
|
||
"4 False 0 True \n",
|
||
"... ... ... ... \n",
|
||
"1995 True 0 True \n",
|
||
"1996 False 0 True \n",
|
||
"1997 False 0 True \n",
|
||
"1998 True 0 True \n",
|
||
"1999 False 0 True \n",
|
||
"\n",
|
||
" received_for_free \n",
|
||
"0 False \n",
|
||
"1 False \n",
|
||
"2 False \n",
|
||
"3 False \n",
|
||
"4 False \n",
|
||
"... ... \n",
|
||
"1995 False \n",
|
||
"1996 False \n",
|
||
"1997 False \n",
|
||
"1998 False \n",
|
||
"1999 False \n",
|
||
"\n",
|
||
"[2000 rows x 13 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"for col in object_cols:\n",
|
||
" df[col] = df[col].str.lower()\n",
|
||
"df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "0323f4c4-7985-4f4d-8306-e77b70f91a4b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пример обработанных текстов:\n",
|
||
"0 apparently favorite colour red astarion red fl...\n",
|
||
"1 literally got new ssd play update worth idk st...\n",
|
||
"2 incredible game incredible graphics lovable co...\n",
|
||
"Name: processed_text, dtype: object\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Initialise the Natasha pipeline components\n",
|
||
"segmenter = Segmenter()\n",
|
||
"morph_vocab = MorphVocab()\n",
|
||
"emb = NewsEmbedding()\n",
|
||
"morph_tagger = NewsMorphTagger(emb)\n",
|
||
"\n",
|
||
"# Text preprocessing\n",
|
||
"def preprocess_text(text):\n",
|
||
"    \"\"\"Normalise raw text to lowercase words separated by single spaces.\n",
|
||
"\n",
|
||
"    Every character that is neither a word character nor whitespace, and every\n",
|
||
"    digit or underscore, is replaced by a space; whitespace runs (including\n",
|
||
"    line breaks) are then collapsed and the result is trimmed and lowercased.\n",
|
||
"    \"\"\"\n",
|
||
"    without_symbols = re.sub(r'[^\\w\\s]|[\\d_]', ' ', text, flags=re.UNICODE)\n",
|
||
"    collapsed = re.sub(r'\\s+', ' ', without_symbols)\n",
|
||
"    return collapsed.strip().lower()\n",
|
||
"\n",
|
||
"# Build the stop-word lookup once. The previous version called\n",
|
||
"# stopwords.words('english') for every token and scanned the returned list.\n",
|
||
"ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))\n",
|
||
"\n",
|
||
"def lemmatize_text(text):\n",
|
||
"    \"\"\"Lemmatize ``text`` with Natasha and drop stop words and short lemmas.\n",
|
||
"\n",
|
||
"    Returns the remaining lemmas joined by single spaces. Empty lemmas, lemmas\n",
|
||
"    of two characters or fewer, and NLTK English stop words are removed.\n",
|
||
"    \"\"\"\n",
|
||
"    # NOTE(review): Natasha's models appear to target Russian, while the reviews\n",
|
||
"    # displayed in this notebook are English - confirm this lemmatizer is intended.\n",
|
||
"    doc = Doc(text)\n",
|
||
"    doc.segment(segmenter)\n",
|
||
"    doc.tag_morph(morph_tagger)\n",
|
||
"    for token in doc.tokens:\n",
|
||
"        token.lemmatize(morph_vocab)\n",
|
||
"    # Keep only informative lemmas: non-empty, longer than 2 chars, not a stop word\n",
|
||
"    return \" \".join(\n",
|
||
"        token.lemma for token in doc.tokens\n",
|
||
"        if token.lemma and len(token.lemma) > 2\n",
|
||
"        and token.lemma not in ENGLISH_STOPWORDS\n",
|
||
"    )\n",
|
||
"\n",
|
||
"# Apply the preprocessing\n",
|
||
"df['review'] = df['review'].apply(preprocess_text)\n",
|
||
"df['processed_text'] = df['review'].apply(lemmatize_text)\n",
|
||
"\n",
|
||
"# Inspect the data after processing\n",
|
||
"print(\"Пример обработанных текстов:\")\n",
|
||
"print(df['processed_text'].head(3), end='\\n\\n')\n",
|
||
"\n",
|
||
"# Vectorisation: TF-IDF over word 1- to 3-grams of the lemmatised text\n",
|
||
"vectorizer = TfidfVectorizer(\n",
|
||
"    stop_words=stopwords.words('english'),  # drop English stop words\n",
|
||
"    ngram_range=(1, 3),\n",
|
||
"    min_df=5,  # ignore rare terms (fewer than 5 documents)\n",
|
||
"    max_df=0.7,  # ignore overly frequent terms (more than 70% of documents)\n",
|
||
"    max_features=1000\n",
|
||
")\n",
|
||
"tfidf_matrix = vectorizer.fit_transform(df[\"processed_text\"])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f45b55d5-663b-4b14-89e5-8cd1ba359252",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Split features and labels; a KeyError here means the label column is missing from df.\n",
|
||
"# NOTE(review): df.info() lists votes_up as int64 and voted_up as bool -\n",
|
||
"# confirm that votes_up is the intended classification target.\n",
|
||
"try:\n",
|
||
"    X_train, X_test, y_train, y_test = train_test_split(\n",
|
||
"        tfidf_matrix,\n",
|
||
"        df['votes_up'],  # make sure the label column exists!\n",
|
||
"        test_size=0.25,\n",
|
||
"        random_state=42\n",
|
||
"    )\n",
|
||
"except KeyError as e:\n",
|
||
"    print(f\"Ошибка: {e} - проверьте наличие столбца с метками в данных!\")\n",
|
||
"    # Re-raise instead of calling exit(): the cell still stops with a traceback,\n",
|
||
"    # but the kernel and everything already computed stay alive.\n",
|
||
"    raise"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8c905d1b-44b2-4f27-8cf4-c21a148ffe7d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 7. Model tuning with GridSearch\n",
|
||
"param_grid = {\n",
|
||
"    'C': [0.1, 1, 10],\n",
|
||
"    'penalty': ['l1', 'l2'],\n",
|
||
"    'solver': ['liblinear', 'saga']\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Seed the estimator: the liblinear and saga solvers shuffle the data, so without\n",
|
||
"# random_state the fitted models can differ between runs. 42 matches the split above.\n",
|
||
"logreg = LogisticRegression(max_iter=1000, random_state=42)\n",
|
||
"grid_search = GridSearchCV(logreg, param_grid, cv=5, n_jobs=-1)\n",
|
||
"grid_search.fit(X_train, y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6bce774f-94a3-4f01-9213-3d208c8c9300",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 8. Model evaluation\n",
|
||
"best_model = grid_search.best_estimator_\n",
|
||
"y_pred = best_model.predict(X_test)\n",
|
||
"\n",
|
||
"print(\"\\nЛучшие параметры модели:\", grid_search.best_params_)\n",
|
||
"print(\"\\nClassification Report:\")\n",
|
||
"print(classification_report(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "0cbcc13f-b585-4a34-abab-dfc320683281",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 9. Confusion-matrix visualisation\n",
|
||
"# Use one label array for both the matrix and the tick labels. best_model.classes_\n",
|
||
"# only holds labels seen during training, so it can differ from the labels that\n",
|
||
"# actually occur in y_test / y_pred and mislabel the axes.\n",
|
||
"cm_labels = np.union1d(y_test, y_pred)\n",
|
||
"sns.heatmap(\n",
|
||
"    confusion_matrix(y_test, y_pred, labels=cm_labels),\n",
|
||
"    annot=True,\n",
|
||
"    fmt='d',\n",
|
||
"    cmap='Blues',\n",
|
||
"    xticklabels=cm_labels,\n",
|
||
"    yticklabels=cm_labels\n",
|
||
")\n",
|
||
"plt.xlabel('Predicted')\n",
|
||
"plt.ylabel('True')\n",
|
||
"plt.title('Confusion Matrix')\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "3b3f9fa6-d044-410e-94e9-57a468e6f3eb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 10. Feature importance analysis\n",
|
||
"feature_names = vectorizer.get_feature_names_out()\n",
|
||
"coefficients = best_model.coef_[0]\n",
|
||
"top_features = np.argsort(coefficients)[-20:]\n",
|
||
"\n",
|
||
"print(\"\\nТоп-20 значимых признаков:\")\n",
|
||
"for idx in top_features:\n",
|
||
" print(f\"{feature_names[idx]}: {coefficients[idx]:.3f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a4eb3a05-e84f-4576-b5ed-0cec03f1f787",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|