From 4818c999a6381d0df400225b70457916acc09138 Mon Sep 17 00:00:00 2001 From: bigbob123 Date: Sun, 23 Feb 2025 16:37:47 +0000 Subject: [PATCH] =?UTF-8?q?=D0=A3=D0=B4=D0=B0=D0=BB=D0=B8=D1=82=D1=8C=20mo?= =?UTF-8?q?dule=20V-checkpoint.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- module V-checkpoint.ipynb | 1108 ------------------------------------- 1 file changed, 1108 deletions(-) delete mode 100644 module V-checkpoint.ipynb diff --git a/module V-checkpoint.ipynb b/module V-checkpoint.ipynb deleted file mode 100644 index f5c968c..0000000 --- a/module V-checkpoint.ipynb +++ /dev/null @@ -1,1108 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "d792e3b7-4590-4545-a343-2c9ee606f50e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: natasha in c:\\programdata\\anaconda3\\lib\\site-packages (1.6.0)\n", - "Requirement already satisfied: pymorphy2 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.1)\n", - "Requirement already satisfied: razdel>=0.5.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.5.0)\n", - "Requirement already satisfied: navec>=0.9.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.10.0)\n", - "Requirement already satisfied: slovnet>=0.6.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.6.0)\n", - "Requirement already satisfied: yargy>=0.16.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.16.0)\n", - "Requirement already satisfied: ipymarkup>=0.8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from natasha) (0.9.0)\n", - "Requirement already satisfied: intervaltree>=3 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipymarkup>=0.8.0->natasha) (3.1.0)\n", - "Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from navec>=0.9.0->natasha) (1.26.4)\n", - "Requirement already satisfied: dawg-python>=0.7.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.7.2)\n", - "Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (2.4.417127.4579844)\n", - "Requirement already satisfied: docopt>=0.6 in c:\\programdata\\anaconda3\\lib\\site-packages (from pymorphy2->natasha) (0.6.2)\n", - "Requirement already satisfied: sortedcontainers<3.0,>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from intervaltree>=3->ipymarkup>=0.8.0->natasha) (2.4.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# устанавливаем библиотеку natasha для дальнейших работ\n", - "pip install natasha" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c6a64528-8064-41aa-83f1-591096e03080", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: bs4 in c:\\programdata\\anaconda3\\lib\\site-packages (0.0.2)\n", - "Requirement already satisfied: beautifulsoup4 in c:\\programdata\\anaconda3\\lib\\site-packages (from bs4) (4.12.3)\n", - "Requirement already satisfied: soupsieve>1.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from beautifulsoup4->bs4) (2.5)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# устанавливаем библиотеку bs4 (Beautiful Soup 4) для дальнейших работ\n", - "pip install bs4" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "407b9639-427f-4e2d-98a0-bafc0bba45f6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
recommendationidlanguagereviewtimestamp_createdtimestamp_updatedvoted_upvotes_upvotes_funnyweighted_vote_scorewritten_during_early_accesscomment_countsteam_purchasereceived_for_free
0149400957englishApparently, my favorite colour is red. Astario...16989603361698960336True100.00000False0TrueFalse
1150084129englishI literally got a new 1TB SSD to play this aft...16999130721699913072True100.00000False0TrueFalse
2148459881englishAn incredible game, Bg3 has incredible graphic...16976688911697668891True000.00000False0FalseFalse
379014545englishgood16049537621691397976True000.00000True0TrueFalse
4144059171englishI havent been this invested into a game since ...16919022451691902245True000.00000False0TrueFalse
..........................................
199577854487englishThe graphics, game play, and voice acting are ...16031686761634929644True000.00000True0TrueFalse
1996148174826englishI,m very new to a game like this but i already...16972537581697253758True000.00000False0TrueFalse
1997149257034englishA real work of art, tremendous amounts of deta...16987657181698765718True000.00000False0TrueFalse
1998121727958englishGame is still in flux, but I trust the develop...16622895471662289547True000.00000True0TrueFalse
1999144358746english⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...16922160131692216013True100.52381False0TrueFalse
\n", - "

2000 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " recommendationid language \\\n", - "0 149400957 english \n", - "1 150084129 english \n", - "2 148459881 english \n", - "3 79014545 english \n", - "4 144059171 english \n", - "... ... ... \n", - "1995 77854487 english \n", - "1996 148174826 english \n", - "1997 149257034 english \n", - "1998 121727958 english \n", - "1999 144358746 english \n", - "\n", - " review timestamp_created \\\n", - "0 Apparently, my favorite colour is red. Astario... 1698960336 \n", - "1 I literally got a new 1TB SSD to play this aft... 1699913072 \n", - "2 An incredible game, Bg3 has incredible graphic... 1697668891 \n", - "3 good 1604953762 \n", - "4 I havent been this invested into a game since ... 1691902245 \n", - "... ... ... \n", - "1995 The graphics, game play, and voice acting are ... 1603168676 \n", - "1996 I,m very new to a game like this but i already... 1697253758 \n", - "1997 A real work of art, tremendous amounts of deta... 1698765718 \n", - "1998 Game is still in flux, but I trust the develop... 1662289547 \n", - "1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n", - "\n", - " timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n", - "0 1698960336 True 1 0 0.00000 \n", - "1 1699913072 True 1 0 0.00000 \n", - "2 1697668891 True 0 0 0.00000 \n", - "3 1691397976 True 0 0 0.00000 \n", - "4 1691902245 True 0 0 0.00000 \n", - "... ... ... ... ... ... \n", - "1995 1634929644 True 0 0 0.00000 \n", - "1996 1697253758 True 0 0 0.00000 \n", - "1997 1698765718 True 0 0 0.00000 \n", - "1998 1662289547 True 0 0 0.00000 \n", - "1999 1692216013 True 1 0 0.52381 \n", - "\n", - " written_during_early_access comment_count steam_purchase \\\n", - "0 False 0 True \n", - "1 False 0 True \n", - "2 False 0 False \n", - "3 True 0 True \n", - "4 False 0 True \n", - "... ... ... ... \n", - "1995 True 0 True \n", - "1996 False 0 True \n", - "1997 False 0 True \n", - "1998 True 0 True \n", - "1999 False 0 True \n", - "\n", - " received_for_free \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False \n", - "... ... \n", - "1995 False \n", - "1996 False \n", - "1997 False \n", - "1998 False \n", - "1999 False \n", - "\n", - "[2000 rows x 13 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import re\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from natasha import (\n", - " Segmenter,\n", - " MorphVocab,\n", - " NewsEmbedding,\n", - " NewsMorphTagger,\n", - " Doc\n", - ")\n", - "import nltk\n", - "from nltk.corpus import stopwords\n", - "from string import punctuation\n", - "from nltk.tokenize import word_tokenize\n", - "# Закоментил, т.к выполнять это нужно всего 1 раз\n", - "# nltk.download('punkt')\n", - "# nltk.download('stopwords')\n", - "# nltk.download('punkt_tab')\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import GridSearchCV\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.metrics import confusion_matrix\n", - "from bs4 import BeautifulSoup\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "\n", - "df = pd.read_csv('sample-module-v.csv')\n", - "\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "72e59a17-88fe-47f5-bf1c-7ab49dbb1761", - "metadata": {}, - "outputs": [], - "source": [ - "df['review'] = df['review'].fillna('-------------------------------------------------------------------------------------------------')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cf36326a-5b93-4d29-b420-711e0f323aa6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
languagereview
0englishApparently, my favorite colour is red. Astario...
1englishI literally got a new 1TB SSD to play this aft...
2englishAn incredible game, Bg3 has incredible graphic...
3englishgood
4englishI havent been this invested into a game since ...
.........
1995englishThe graphics, game play, and voice acting are ...
1996englishI,m very new to a game like this but i already...
1997englishA real work of art, tremendous amounts of deta...
1998englishGame is still in flux, but I trust the develop...
1999english⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...
\n", - "

2000 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " language review\n", - "0 english Apparently, my favorite colour is red. Astario...\n", - "1 english I literally got a new 1TB SSD to play this aft...\n", - "2 english An incredible game, Bg3 has incredible graphic...\n", - "3 english good\n", - "4 english I havent been this invested into a game since ...\n", - "... ... ...\n", - "1995 english The graphics, game play, and voice acting are ...\n", - "1996 english I,m very new to a game like this but i already...\n", - "1997 english A real work of art, tremendous amounts of deta...\n", - "1998 english Game is still in flux, but I trust the develop...\n", - "1999 english ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...\n", - "\n", - "[2000 rows x 2 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "object_cols = df.select_dtypes('object')\n", - "object_cols" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d3044cb9-649c-407c-b1ff-35a8ef4688ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 2000 entries, 0 to 1999\n", - "Data columns (total 13 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 recommendationid 2000 non-null int64 \n", - " 1 language 2000 non-null object \n", - " 2 review 2000 non-null object \n", - " 3 timestamp_created 2000 non-null int64 \n", - " 4 timestamp_updated 2000 non-null int64 \n", - " 5 voted_up 2000 non-null bool \n", - " 6 votes_up 2000 non-null int64 \n", - " 7 votes_funny 2000 non-null int64 \n", - " 8 weighted_vote_score 2000 non-null float64\n", - " 9 written_during_early_access 2000 non-null bool \n", - " 10 comment_count 2000 non-null int64 \n", - " 11 steam_purchase 2000 non-null bool \n", - " 12 received_for_free 2000 non-null bool \n", - "dtypes: bool(4), float64(1), int64(6), object(2)\n", - "memory usage: 148.6+ KB\n" - ] - } - ], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "547cc03f-f889-4d63-838c-b84ee566a4e1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "weighted_vote_score\n", - "0.000000 1539\n", - "0.523810 155\n", - "0.476190 26\n", - "0.545455 25\n", - "0.521739 18\n", - " ... \n", - "0.503801 1\n", - "0.510978 1\n", - "0.474167 1\n", - "0.501501 1\n", - "0.437956 1\n", - "Name: count, Length: 201, dtype: int64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['weighted_vote_score'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "05491808-33db-4c50-a957-1e9abb3fe1c0", - "metadata": {}, - "outputs": [], - "source": [ - "def strip_html_tags(html):\n", - " if pd.isna(html):\n", - " return html # Возвращаем NaN без изменений\n", - " soup = BeautifulSoup(html, 'html.parser')\n", - " return soup.get_text(separator=' ', strip=True)\n", - "\n", - "df['review'] = df['review'].apply(strip_html_tags)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8ce53318-f17d-4869-938a-f794be97378c", - "metadata": {}, - "outputs": [], - "source": [ - "df['review'] = df['review'].str.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "b5e1e5fa-6e02-4660-84b9-1aa072f33513", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
recommendationidlanguagereviewtimestamp_createdtimestamp_updatedvoted_upvotes_upvotes_funnyweighted_vote_scorewritten_during_early_accesscomment_countsteam_purchasereceived_for_free
0149400957englishapparently, my favorite colour is red. astario...16989603361698960336True100.00000False0TrueFalse
1150084129englishi literally got a new 1tb ssd to play this aft...16999130721699913072True100.00000False0TrueFalse
2148459881englishan incredible game, bg3 has incredible graphic...16976688911697668891True000.00000False0FalseFalse
379014545englishgood16049537621691397976True000.00000True0TrueFalse
4144059171englishi havent been this invested into a game since ...16919022451691902245True000.00000False0TrueFalse
..........................................
199577854487englishthe graphics, game play, and voice acting are ...16031686761634929644True000.00000True0TrueFalse
1996148174826englishi,m very new to a game like this but i already...16972537581697253758True000.00000False0TrueFalse
1997149257034englisha real work of art, tremendous amounts of deta...16987657181698765718True000.00000False0TrueFalse
1998121727958englishgame is still in flux, but i trust the develop...16622895471662289547True000.00000True0TrueFalse
1999144358746english⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀...16922160131692216013True100.52381False0TrueFalse
\n", - "

2000 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " recommendationid language \\\n", - "0 149400957 english \n", - "1 150084129 english \n", - "2 148459881 english \n", - "3 79014545 english \n", - "4 144059171 english \n", - "... ... ... \n", - "1995 77854487 english \n", - "1996 148174826 english \n", - "1997 149257034 english \n", - "1998 121727958 english \n", - "1999 144358746 english \n", - "\n", - " review timestamp_created \\\n", - "0 apparently, my favorite colour is red. astario... 1698960336 \n", - "1 i literally got a new 1tb ssd to play this aft... 1699913072 \n", - "2 an incredible game, bg3 has incredible graphic... 1697668891 \n", - "3 good 1604953762 \n", - "4 i havent been this invested into a game since ... 1691902245 \n", - "... ... ... \n", - "1995 the graphics, game play, and voice acting are ... 1603168676 \n", - "1996 i,m very new to a game like this but i already... 1697253758 \n", - "1997 a real work of art, tremendous amounts of deta... 1698765718 \n", - "1998 game is still in flux, but i trust the develop... 1662289547 \n", - "1999 ⣿⣿⣿⣿⣿⣿⣿⣿⡿⠿⠛⠛⠛⠋⠉⠈⠉⠉⠉⠉⠛⠻⢿⣿⣿⣿⣿⣿⣿⣿\\n⣿⣿⣿⣿⣿⡿⠋⠁⠀⠀⠀⠀⠀⠀... 1692216013 \n", - "\n", - " timestamp_updated voted_up votes_up votes_funny weighted_vote_score \\\n", - "0 1698960336 True 1 0 0.00000 \n", - "1 1699913072 True 1 0 0.00000 \n", - "2 1697668891 True 0 0 0.00000 \n", - "3 1691397976 True 0 0 0.00000 \n", - "4 1691902245 True 0 0 0.00000 \n", - "... ... ... ... ... ... \n", - "1995 1634929644 True 0 0 0.00000 \n", - "1996 1697253758 True 0 0 0.00000 \n", - "1997 1698765718 True 0 0 0.00000 \n", - "1998 1662289547 True 0 0 0.00000 \n", - "1999 1692216013 True 1 0 0.52381 \n", - "\n", - " written_during_early_access comment_count steam_purchase \\\n", - "0 False 0 True \n", - "1 False 0 True \n", - "2 False 0 False \n", - "3 True 0 True \n", - "4 False 0 True \n", - "... ... ... ... \n", - "1995 True 0 True \n", - "1996 False 0 True \n", - "1997 False 0 True \n", - "1998 True 0 True \n", - "1999 False 0 True \n", - "\n", - " received_for_free \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False \n", - "... ... \n", - "1995 False \n", - "1996 False \n", - "1997 False \n", - "1998 False \n", - "1999 False \n", - "\n", - "[2000 rows x 13 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "for col in object_cols:\n", - " df[col] = df[col].str.lower()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "0323f4c4-7985-4f4d-8306-e77b70f91a4b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Пример обработанных текстов:\n", - "0 apparently favorite colour red astarion red fl...\n", - "1 literally got new ssd play update worth idk st...\n", - "2 incredible game incredible graphics lovable co...\n", - "Name: processed_text, dtype: object\n", - "\n" - ] - } - ], - "source": [ - "# Инициализация Natasha\n", - "segmenter = Segmenter()\n", - "morph_vocab = MorphVocab()\n", - "emb = NewsEmbedding()\n", - "morph_tagger = NewsMorphTagger(emb)\n", - "\n", - "# Предобработка текста\n", - "def preprocess_text(text):\n", - " # Улучшенное удаление спецсимволов (включая переносы строк)\n", - " text = re.sub(r'[^\\w\\s]|[\\d_]', ' ', text, flags=re.UNICODE)\n", - " text = re.sub(r'\\s+', ' ', text).strip().lower()\n", - " return text\n", - "\n", - "def lemmatize_text(text):\n", - " doc = Doc(text)\n", - " doc.segment(segmenter)\n", - " doc.tag_morph(morph_tagger)\n", - " for token in doc.tokens:\n", - " token.lemmatize(morph_vocab)\n", - " # Фильтрация стоп-слов и коротких лемм\n", - " return \" \".join([\n", - " token.lemma for token in doc.tokens \n", - " if token.lemma and len(token.lemma) > 2\n", - " and token.lemma not in stopwords.words('english')\n", - " ])\n", - "\n", - "# Применение обработки\n", - "df['review'] = df['review'].apply(preprocess_text)\n", - "df['processed_text'] = df['review'].apply(lemmatize_text)\n", - "\n", - "# Проверка данных после обработки\n", - "print(\"Пример обработанных текстов:\")\n", - "print(df['processed_text'].head(3), end='\\n\\n')\n", - "\n", - "# Векторизация\n", - "vectorizer = TfidfVectorizer(\n", - " max_features=1000,\n", - " ngram_range=(1, 3), \n", - " stop_words=stopwords.words('english'), # Добавление стоп-слов\n", - " min_df=5, # Игнорирование редких слов\n", - " max_df=0.7 # Игнорирование слишком частых слов\n", - ")\n", - "tfidf_matrix = vectorizer.fit_transform(df[\"processed_text\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f45b55d5-663b-4b14-89e5-8cd1ba359252", - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " X_train, X_test, y_train, y_test = train_test_split(\n", - " tfidf_matrix, \n", - " df['votes_up'], # Убедитесь, что столбец с метками существует!\n", - " test_size=0.25,\n", - " random_state=42\n", - " )\n", - "except KeyError as e:\n", - " print(f\"Ошибка: {e} - проверьте наличие столбца с метками в данных!\")\n", - " exit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c905d1b-44b2-4f27-8cf4-c21a148ffe7d", - "metadata": {}, - "outputs": [], - "source": [ - "# 7. Настройка модели с GridSearch\n", - "param_grid = {\n", - " 'C': [0.1, 1, 10],\n", - " 'penalty': ['l1', 'l2'],\n", - " 'solver': ['liblinear', 'saga']\n", - "}\n", - "\n", - "logreg = LogisticRegression(max_iter=1000)\n", - "grid_search = GridSearchCV(logreg, param_grid, cv=5, n_jobs=-1)\n", - "grid_search.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6bce774f-94a3-4f01-9213-3d208c8c9300", - "metadata": {}, - "outputs": [], - "source": [ - "# 8. Оценка модели\n", - "best_model = grid_search.best_estimator_\n", - "y_pred = best_model.predict(X_test)\n", - "\n", - "print(\"\\nЛучшие параметры модели:\", grid_search.best_params_)\n", - "print(\"\\nClassification Report:\")\n", - "print(classification_report(y_test, y_pred))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0cbcc13f-b585-4a34-abab-dfc320683281", - "metadata": {}, - "outputs": [], - "source": [ - "# 9. Визуализация матрицы ошибок\n", - "sns.heatmap(\n", - " confusion_matrix(y_test, y_pred),\n", - " annot=True,\n", - " fmt='d',\n", - " cmap='Blues',\n", - " xticklabels=best_model.classes_,\n", - " yticklabels=best_model.classes_\n", - ")\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('True')\n", - "plt.title('Confusion Matrix')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b3f9fa6-d044-410e-94e9-57a468e6f3eb", - "metadata": {}, - "outputs": [], - "source": [ - "# 10. Анализ важности признаков\n", - "feature_names = vectorizer.get_feature_names_out()\n", - "coefficients = best_model.coef_[0]\n", - "top_features = np.argsort(coefficients)[-20:]\n", - "\n", - "print(\"\\nТоп-20 значимых признаков:\")\n", - "for idx in top_features:\n", - " print(f\"{feature_names[idx]}: {coefficients[idx]:.3f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4eb3a05-e84f-4576-b5ed-0cec03f1f787", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}