diff --git a/module V-checkpoint.ipynb b/module V-checkpoint.ipynb
deleted file mode 100644
index f5c968c..0000000
--- a/module V-checkpoint.ipynb
+++ /dev/null
@@ -1,1108 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "d792e3b7-4590-4545-a343-2c9ee606f50e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: natasha in c:\\programdata\\anaconda3\\lib\\site-packages (1.6.0)\n",
- "Requirement already satisfied: pymorphy2 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.1)\n",
- "Requirement already satisfied: razdel>=0.5.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.5.0)\n",
- "Requirement already satisfied: navec>=0.9.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.10.0)\n",
- "Requirement already satisfied: slovnet>=0.6.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.6.0)\n",
- "Requirement already satisfied: yargy>=0.16.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.16.0)\n",
- "Requirement already satisfied: ipymarkup>=0.8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.0)\n",
- "Requirement already satisfied: intervaltree>=3 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipymarkup>=0.8.0->natasha) (3.1.0)\n",
- "Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from navec>=0.9.0->natasha) (1.26.4)\n",
- "Requirement already satisfied: dawg-python>=0.7.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.7.2)\n",
- "Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (2.4.417127.4579844)\n",
- "Requirement already satisfied: docopt>=0.6 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.6.2)\n",
- "Requirement already satisfied: sortedcontainers<3.0,>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from intervaltree>=3->ipymarkup>=0.8.0->natasha) (2.4.0)\n",
- "Note: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "# Install the natasha NLP library used by the lemmatization step below.\n",
- "# %pip targets the running kernel's environment; the pin matches the version\n",
- "# reported in this cell's recorded output.\n",
- "%pip install natasha==1.6.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "c6a64528-8064-41aa-83f1-591096e03080",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: bs4 in c:\\programdata\\anaconda3\\lib\\site-packages (0.0.2)\n",
- "Requirement already satisfied: beautifulsoup4 in c:\\programdata\\anaconda3\\lib\\site-packages (from bs4) (4.12.3)\n",
- "Requirement already satisfied: soupsieve>1.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from beautifulsoup4->bs4) (2.5)\n",
- "Note: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "# Install Beautiful Soup 4, used below to strip markup from the review text.\n",
- "# The PyPI name `bs4` is only a thin wrapper that pulls in `beautifulsoup4`,\n",
- "# so install the real package directly; the pin matches this cell's recorded output.\n",
- "%pip install beautifulsoup4==4.12.3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "407b9639-427f-4e2d-98a0-bafc0bba45f6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " recommendationid | \n",
- " language | \n",
- " review | \n",
- " timestamp_created | \n",
- " timestamp_updated | \n",
- " voted_up | \n",
- " votes_up | \n",
- " votes_funny | \n",
- " weighted_vote_score | \n",
- " written_during_early_access | \n",
- " comment_count | \n",
- " steam_purchase | \n",
- " received_for_free | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 149400957 | \n",
- " english | \n",
- " Apparently, my favorite colour is red. Astario... | \n",
- " 1698960336 | \n",
- " 1698960336 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 150084129 | \n",
- " english | \n",
- " I literally got a new 1TB SSD to play this aft... | \n",
- " 1699913072 | \n",
- " 1699913072 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 148459881 | \n",
- " english | \n",
- " An incredible game, Bg3 has incredible graphic... | \n",
- " 1697668891 | \n",
- " 1697668891 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 79014545 | \n",
- " english | \n",
- " good | \n",
- " 1604953762 | \n",
- " 1691397976 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 144059171 | \n",
- " english | \n",
- " I havent been this invested into a game since ... | \n",
- " 1691902245 | \n",
- " 1691902245 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1995 | \n",
- " 77854487 | \n",
- " english | \n",
- " The graphics, game play, and voice acting are ... | \n",
- " 1603168676 | \n",
- " 1634929644 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1996 | \n",
- " 148174826 | \n",
- " english | \n",
- " I,m very new to a game like this but i already... | \n",
- " 1697253758 | \n",
- " 1697253758 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1997 | \n",
- " 149257034 | \n",
- " english | \n",
- " A real work of art, tremendous amounts of deta... | \n",
- " 1698765718 | \n",
- " 1698765718 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1998 | \n",
- " 121727958 | \n",
- " english | \n",
- " Game is still in flux, but I trust the develop... | \n",
- " 1662289547 | \n",
- " 1662289547 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1999 | \n",
- " 144358746 | \n",
- " english | \n",
- " ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... | \n",
- " 1692216013 | \n",
- " 1692216013 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.52381 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
2000 rows × 13 columns
\n",
- "
"
- ],
- "text/plain": [
- " recommendationid language \\\n",
- "0 149400957 english \n",
- "1 150084129 english \n",
- "2 148459881 english \n",
- "3 79014545 english \n",
- "4 144059171 english \n",
- "... ... ... \n",
- "1995 77854487 english \n",
- "1996 148174826 english \n",
- "1997 149257034 english \n",
- "1998 121727958 english \n",
- "1999 144358746 english \n",
- "\n",
- " review timestamp_created \\\n",
- "0 Apparently, my favorite colour is red. Astario... 1698960336 \n",
- "1 I literally got a new 1TB SSD to play this aft... 1699913072 \n",
- "2 An incredible game, Bg3 has incredible graphic... 1697668891 \n",
- "3 good 1604953762 \n",
- "4 I havent been this invested into a game since ... 1691902245 \n",
- "... ... ... \n",
- "1995 The graphics, game play, and voice acting are ... 1603168676 \n",
- "1996 I,m very new to a game like this but i already... 1697253758 \n",
- "1997 A real work of art, tremendous amounts of deta... 1698765718 \n",
- "1998 Game is still in flux, but I trust the develop... 1662289547 \n",
- "1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n",
- "\n",
- " timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
- "0 1698960336 True 1 0 0.00000 \n",
- "1 1699913072 True 1 0 0.00000 \n",
- "2 1697668891 True 0 0 0.00000 \n",
- "3 1691397976 True 0 0 0.00000 \n",
- "4 1691902245 True 0 0 0.00000 \n",
- "... ... ... ... ... ... \n",
- "1995 1634929644 True 0 0 0.00000 \n",
- "1996 1697253758 True 0 0 0.00000 \n",
- "1997 1698765718 True 0 0 0.00000 \n",
- "1998 1662289547 True 0 0 0.00000 \n",
- "1999 1692216013 True 1 0 0.52381 \n",
- "\n",
- " written_during_early_access comment_count steam_purchase \\\n",
- "0 False 0 True \n",
- "1 False 0 True \n",
- "2 False 0 False \n",
- "3 True 0 True \n",
- "4 False 0 True \n",
- "... ... ... ... \n",
- "1995 True 0 True \n",
- "1996 False 0 True \n",
- "1997 False 0 True \n",
- "1998 True 0 True \n",
- "1999 False 0 True \n",
- "\n",
- " received_for_free \n",
- "0 False \n",
- "1 False \n",
- "2 False \n",
- "3 False \n",
- "4 False \n",
- "... ... \n",
- "1995 False \n",
- "1996 False \n",
- "1997 False \n",
- "1998 False \n",
- "1999 False \n",
- "\n",
- "[2000 rows x 13 columns]"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import re\n",
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "from natasha import (\n",
- " Segmenter,\n",
- " MorphVocab,\n",
- " NewsEmbedding,\n",
- " NewsMorphTagger,\n",
- " Doc\n",
- ")\n",
- "import nltk\n",
- "from nltk.corpus import stopwords\n",
- "from string import punctuation\n",
- "from nltk.tokenize import word_tokenize\n",
- "# Закоментил, т.к выполнять это нужно всего 1 раз\n",
- "# nltk.download('punkt')\n",
- "# nltk.download('stopwords')\n",
- "# nltk.download('punkt_tab')\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "from sklearn.model_selection import GridSearchCV\n",
- "from sklearn.metrics import classification_report\n",
- "from sklearn.metrics import confusion_matrix\n",
- "from bs4 import BeautifulSoup\n",
- "import warnings\n",
- "# Silence every warning for the rest of the session.\n",
- "# NOTE(review): this also hides deprecation/convergence warnings — consider narrowing the filter.\n",
- "warnings.filterwarnings('ignore')\n",
- "\n",
- "# Load the review sample; the path is relative to the notebook's working directory.\n",
- "df = pd.read_csv('sample-module-v.csv')\n",
- "\n",
- "# Bare last expression so the notebook renders the frame as a rich table.\n",
- "df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "72e59a17-88fe-47f5-bf1c-7ab49dbb1761",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Fill missing reviews with an empty string so the string operations in later cells never see NaN.\n",
- "# A punctuation-only placeholder would add nothing: preprocess_text replaces punctuation with spaces anyway.\n",
- "df['review'] = df['review'].fillna('')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "cf36326a-5b93-4d29-b420-711e0f323aa6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " language | \n",
- " review | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " english | \n",
- " Apparently, my favorite colour is red. Astario... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " english | \n",
- " I literally got a new 1TB SSD to play this aft... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " english | \n",
- " An incredible game, Bg3 has incredible graphic... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " english | \n",
- " good | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " english | \n",
- " I havent been this invested into a game since ... | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1995 | \n",
- " english | \n",
- " The graphics, game play, and voice acting are ... | \n",
- "
\n",
- " \n",
- " 1996 | \n",
- " english | \n",
- " I,m very new to a game like this but i already... | \n",
- "
\n",
- " \n",
- " 1997 | \n",
- " english | \n",
- " A real work of art, tremendous amounts of deta... | \n",
- "
\n",
- " \n",
- " 1998 | \n",
- " english | \n",
- " Game is still in flux, but I trust the develop... | \n",
- "
\n",
- " \n",
- " 1999 | \n",
- " english | \n",
- " ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... | \n",
- "
\n",
- " \n",
- "
\n",
- "
2000 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " language review\n",
- "0 english Apparently, my favorite colour is red. Astario...\n",
- "1 english I literally got a new 1TB SSD to play this aft...\n",
- "2 english An incredible game, Bg3 has incredible graphic...\n",
- "3 english good\n",
- "4 english I havent been this invested into a game since ...\n",
- "... ... ...\n",
- "1995 english The graphics, game play, and voice acting are ...\n",
- "1996 english I,m very new to a game like this but i already...\n",
- "1997 english A real work of art, tremendous amounts of deta...\n",
- "1998 english Game is still in flux, but I trust the develop...\n",
- "1999 english ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...\n",
- "\n",
- "[2000 rows x 2 columns]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Keep the text-typed (object dtype) columns in their own frame for inspection.\n",
- "object_cols = df.select_dtypes(include='object')\n",
- "object_cols"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "d3044cb9-649c-407c-b1ff-35a8ef4688ba",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 2000 entries, 0 to 1999\n",
- "Data columns (total 13 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 recommendationid 2000 non-null int64 \n",
- " 1 language 2000 non-null object \n",
- " 2 review 2000 non-null object \n",
- " 3 timestamp_created 2000 non-null int64 \n",
- " 4 timestamp_updated 2000 non-null int64 \n",
- " 5 voted_up 2000 non-null bool \n",
- " 6 votes_up 2000 non-null int64 \n",
- " 7 votes_funny 2000 non-null int64 \n",
- " 8 weighted_vote_score 2000 non-null float64\n",
- " 9 written_during_early_access 2000 non-null bool \n",
- " 10 comment_count 2000 non-null int64 \n",
- " 11 steam_purchase 2000 non-null bool \n",
- " 12 received_for_free 2000 non-null bool \n",
- "dtypes: bool(4), float64(1), int64(6), object(2)\n",
- "memory usage: 148.6+ KB\n"
- ]
- }
- ],
- "source": [
- "# Print column dtypes, non-null counts and memory usage.\n",
- "df.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "547cc03f-f889-4d63-838c-b84ee566a4e1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "weighted_vote_score\n",
- "0.000000 1539\n",
- "0.523810 155\n",
- "0.476190 26\n",
- "0.545455 25\n",
- "0.521739 18\n",
- " ... \n",
- "0.503801 1\n",
- "0.510978 1\n",
- "0.474167 1\n",
- "0.501501 1\n",
- "0.437956 1\n",
- "Name: count, Length: 201, dtype: int64"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Frequency of each distinct weighted_vote_score value, most common first.\n",
- "df['weighted_vote_score'].value_counts()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "05491808-33db-4c50-a957-1e9abb3fe1c0",
- "metadata": {},
- "outputs": [],
- "source": [
- "def strip_html_tags(html):\n",
- "    \"\"\"Return the visible text of an HTML fragment; missing values pass through unchanged.\"\"\"\n",
- "    if not pd.isna(html):\n",
- "        # Trim each text node and join the nodes with single spaces.\n",
- "        return BeautifulSoup(html, 'html.parser').get_text(separator=' ', strip=True)\n",
- "    return html\n",
- "\n",
- "df['review'] = df['review'].apply(strip_html_tags)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "8ce53318-f17d-4869-938a-f794be97378c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Trim leading and trailing whitespace from every review.\n",
- "df['review'] = df['review'].str.strip()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "b5e1e5fa-6e02-4660-84b9-1aa072f33513",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " recommendationid | \n",
- " language | \n",
- " review | \n",
- " timestamp_created | \n",
- " timestamp_updated | \n",
- " voted_up | \n",
- " votes_up | \n",
- " votes_funny | \n",
- " weighted_vote_score | \n",
- " written_during_early_access | \n",
- " comment_count | \n",
- " steam_purchase | \n",
- " received_for_free | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 149400957 | \n",
- " english | \n",
- " apparently, my favorite colour is red. astario... | \n",
- " 1698960336 | \n",
- " 1698960336 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 150084129 | \n",
- " english | \n",
- " i literally got a new 1tb ssd to play this aft... | \n",
- " 1699913072 | \n",
- " 1699913072 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 148459881 | \n",
- " english | \n",
- " an incredible game, bg3 has incredible graphic... | \n",
- " 1697668891 | \n",
- " 1697668891 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 79014545 | \n",
- " english | \n",
- " good | \n",
- " 1604953762 | \n",
- " 1691397976 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 144059171 | \n",
- " english | \n",
- " i havent been this invested into a game since ... | \n",
- " 1691902245 | \n",
- " 1691902245 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1995 | \n",
- " 77854487 | \n",
- " english | \n",
- " the graphics, game play, and voice acting are ... | \n",
- " 1603168676 | \n",
- " 1634929644 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1996 | \n",
- " 148174826 | \n",
- " english | \n",
- " i,m very new to a game like this but i already... | \n",
- " 1697253758 | \n",
- " 1697253758 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1997 | \n",
- " 149257034 | \n",
- " english | \n",
- " a real work of art, tremendous amounts of deta... | \n",
- " 1698765718 | \n",
- " 1698765718 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1998 | \n",
- " 121727958 | \n",
- " english | \n",
- " game is still in flux, but i trust the develop... | \n",
- " 1662289547 | \n",
- " 1662289547 | \n",
- " True | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.00000 | \n",
- " True | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1999 | \n",
- " 144358746 | \n",
- " english | \n",
- " ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... | \n",
- " 1692216013 | \n",
- " 1692216013 | \n",
- " True | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.52381 | \n",
- " False | \n",
- " 0 | \n",
- " True | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
2000 rows × 13 columns
\n",
- "
"
- ],
- "text/plain": [
- " recommendationid language \\\n",
- "0 149400957 english \n",
- "1 150084129 english \n",
- "2 148459881 english \n",
- "3 79014545 english \n",
- "4 144059171 english \n",
- "... ... ... \n",
- "1995 77854487 english \n",
- "1996 148174826 english \n",
- "1997 149257034 english \n",
- "1998 121727958 english \n",
- "1999 144358746 english \n",
- "\n",
- " review timestamp_created \\\n",
- "0 apparently, my favorite colour is red. astario... 1698960336 \n",
- "1 i literally got a new 1tb ssd to play this aft... 1699913072 \n",
- "2 an incredible game, bg3 has incredible graphic... 1697668891 \n",
- "3 good 1604953762 \n",
- "4 i havent been this invested into a game since ... 1691902245 \n",
- "... ... ... \n",
- "1995 the graphics, game play, and voice acting are ... 1603168676 \n",
- "1996 i,m very new to a game like this but i already... 1697253758 \n",
- "1997 a real work of art, tremendous amounts of deta... 1698765718 \n",
- "1998 game is still in flux, but i trust the develop... 1662289547 \n",
- "1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n",
- "\n",
- " timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n",
- "0 1698960336 True 1 0 0.00000 \n",
- "1 1699913072 True 1 0 0.00000 \n",
- "2 1697668891 True 0 0 0.00000 \n",
- "3 1691397976 True 0 0 0.00000 \n",
- "4 1691902245 True 0 0 0.00000 \n",
- "... ... ... ... ... ... \n",
- "1995 1634929644 True 0 0 0.00000 \n",
- "1996 1697253758 True 0 0 0.00000 \n",
- "1997 1698765718 True 0 0 0.00000 \n",
- "1998 1662289547 True 0 0 0.00000 \n",
- "1999 1692216013 True 1 0 0.52381 \n",
- "\n",
- " written_during_early_access comment_count steam_purchase \\\n",
- "0 False 0 True \n",
- "1 False 0 True \n",
- "2 False 0 False \n",
- "3 True 0 True \n",
- "4 False 0 True \n",
- "... ... ... ... \n",
- "1995 True 0 True \n",
- "1996 False 0 True \n",
- "1997 False 0 True \n",
- "1998 True 0 True \n",
- "1999 False 0 True \n",
- "\n",
- " received_for_free \n",
- "0 False \n",
- "1 False \n",
- "2 False \n",
- "3 False \n",
- "4 False \n",
- "... ... \n",
- "1995 False \n",
- "1996 False \n",
- "1997 False \n",
- "1998 False \n",
- "1999 False \n",
- "\n",
- "[2000 rows x 13 columns]"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Lower-case every text column selected into object_cols, then display the frame.\n",
- "for column_name in object_cols.columns:\n",
- "    df[column_name] = df[column_name].str.lower()\n",
- "df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "0323f4c4-7985-4f4d-8306-e77b70f91a4b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Пример обработанных текстов:\n",
- "0 apparently favorite colour red astarion red fl...\n",
- "1 literally got new ssd play update worth idk st...\n",
- "2 incredible game incredible graphics lovable co...\n",
- "Name: processed_text, dtype: object\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Natasha pipeline components, built once and shared by lemmatize_text.\n",
- "# NOTE(review): Natasha's models target Russian text while these reviews are English,\n",
- "# so lemmatization probably leaves most tokens unchanged — confirm this is intended.\n",
- "segmenter = Segmenter()\n",
- "morph_vocab = MorphVocab()\n",
- "emb = NewsEmbedding()\n",
- "morph_tagger = NewsMorphTagger(emb)\n",
- "\n",
- "# Load the English stop-word list once. The list form feeds TfidfVectorizer;\n",
- "# the frozenset gives constant-time membership tests inside lemmatize_text.\n",
- "ENGLISH_STOPWORDS = stopwords.words('english')\n",
- "ENGLISH_STOPWORD_SET = frozenset(ENGLISH_STOPWORDS)\n",
- "\n",
- "\n",
- "def preprocess_text(text):\n",
- "    \"\"\"Normalize raw review text.\n",
- "\n",
- "    Every character that is not a word character or whitespace, plus digits and\n",
- "    underscores, becomes a space; whitespace runs (including newlines) collapse to\n",
- "    a single space; the result is trimmed and lower-cased.\n",
- "    \"\"\"\n",
- "    text = re.sub(r'[^\\w\\s]|[\\d_]', ' ', text, flags=re.UNICODE)\n",
- "    text = re.sub(r'\\s+', ' ', text).strip().lower()\n",
- "    return text\n",
- "\n",
- "\n",
- "def lemmatize_text(text):\n",
- "    \"\"\"Lemmatize text with Natasha and return the kept lemmas joined by spaces.\n",
- "\n",
- "    A lemma is kept when it is non-empty, longer than two characters and not an\n",
- "    English stop word.\n",
- "    \"\"\"\n",
- "    doc = Doc(text)\n",
- "    doc.segment(segmenter)\n",
- "    doc.tag_morph(morph_tagger)\n",
- "    for token in doc.tokens:\n",
- "        token.lemmatize(morph_vocab)\n",
- "    return \" \".join(\n",
- "        token.lemma for token in doc.tokens\n",
- "        if token.lemma\n",
- "        and len(token.lemma) > 2\n",
- "        and token.lemma not in ENGLISH_STOPWORD_SET\n",
- "    )\n",
- "\n",
- "\n",
- "# Clean the raw text in place, then lemmatize into a separate column.\n",
- "df['review'] = df['review'].apply(preprocess_text)\n",
- "df['processed_text'] = df['review'].apply(lemmatize_text)\n",
- "\n",
- "# Sanity check: show the first few processed reviews.\n",
- "print(\"Пример обработанных текстов:\")\n",
- "print(df['processed_text'].head(3), end='\\n\\n')\n",
- "\n",
- "# TF-IDF over 1- to 3-grams of the lemmatized text.\n",
- "vectorizer = TfidfVectorizer(\n",
- "    max_features=1000,\n",
- "    ngram_range=(1, 3),\n",
- "    stop_words=ENGLISH_STOPWORDS,\n",
- "    min_df=5,    # drop terms that appear in fewer than 5 documents\n",
- "    max_df=0.7   # drop terms that appear in more than 70% of documents\n",
- ")\n",
- "tfidf_matrix = vectorizer.fit_transform(df[\"processed_text\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f45b55d5-663b-4b14-89e5-8cd1ba359252",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.\n",
- "# NOTE(review): `votes_up` is an integer column (see df.info()), which makes this a\n",
- "# multi-class target; if the goal is the recommend/not-recommend label, the boolean\n",
- "# `voted_up` column may be the intended one — confirm.\n",
- "try:\n",
- "    X_train, X_test, y_train, y_test = train_test_split(\n",
- "        tfidf_matrix,\n",
- "        df['votes_up'],\n",
- "        test_size=0.25,\n",
- "        random_state=42\n",
- "    )\n",
- "except KeyError as e:\n",
- "    print(f\"Ошибка: {e} - проверьте наличие столбца с метками в данных!\")\n",
- "    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to shut\n",
- "    # down, and the following cells could not run without X_train/y_train anyway.\n",
- "    raise"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8c905d1b-44b2-4f27-8cf4-c21a148ffe7d",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Hyper-parameter search for logistic regression over regularization strength,\n",
- "# penalty type and solver.\n",
- "param_grid = {\n",
- "    'C': [0.1, 1, 10],\n",
- "    'penalty': ['l1', 'l2'],\n",
- "    'solver': ['liblinear', 'saga']\n",
- "}\n",
- "\n",
- "# Both solvers in the grid shuffle the data, so fix the seed (same value as the\n",
- "# train/test split) to make the search reproducible.\n",
- "logreg = LogisticRegression(max_iter=1000, random_state=42)\n",
- "grid_search = GridSearchCV(logreg, param_grid, cv=5, n_jobs=-1)\n",
- "grid_search.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6bce774f-94a3-4f01-9213-3d208c8c9300",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Evaluate the best estimator found by the grid search on the held-out split.\n",
- "best_model = grid_search.best_estimator_\n",
- "y_pred = best_model.predict(X_test)\n",
- "\n",
- "print(\"\\nЛучшие параметры модели:\", grid_search.best_params_)\n",
- "print(\"\\nClassification Report:\", classification_report(y_test, y_pred), sep=\"\\n\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0cbcc13f-b585-4a34-abab-dfc320683281",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Confusion-matrix heat map.\n",
- "# Derive the tick labels from the labels that actually occur in y_test / y_pred\n",
- "# (the same sorted set confusion_matrix uses by default) so the ticks always match\n",
- "# the matrix rows and columns, even when the test split lacks a training class.\n",
- "cm_labels = np.union1d(y_test, y_pred)\n",
- "sns.heatmap(\n",
- "    confusion_matrix(y_test, y_pred, labels=cm_labels),\n",
- "    annot=True,\n",
- "    fmt='d',\n",
- "    cmap='Blues',\n",
- "    xticklabels=cm_labels,\n",
- "    yticklabels=cm_labels\n",
- ")\n",
- "plt.xlabel('Predicted')\n",
- "plt.ylabel('True')\n",
- "plt.title('Confusion Matrix')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3b3f9fa6-d044-410e-94e9-57a468e6f3eb",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Features with the 20 largest coefficients, printed in ascending order.\n",
- "# NOTE(review): coef_[0] is the first row of the coefficient matrix; with more than\n",
- "# two classes that row describes only the first class — confirm this is intended.\n",
- "feature_names = vectorizer.get_feature_names_out()\n",
- "coefficients = best_model.coef_[0]\n",
- "top_features = coefficients.argsort()[-20:]\n",
- "\n",
- "print(\"\\nТоп-20 значимых признаков:\")\n",
- "print(\"\\n\".join(f\"{feature_names[idx]}: {coefficients[idx]:.3f}\" for idx in top_features))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a4eb3a05-e84f-4576-b5ed-0cec03f1f787",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}