This repository was archived by the owner on Feb 14, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare.py
More file actions
92 lines (68 loc) · 3.41 KB
/
prepare.py
File metadata and controls
92 lines (68 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from nltk.corpus import stopwords
import nltk
import pandas as pd
import numpy as np
import csv
import re
import pymorphy2
# Constants:
# Raw input CSV read by data_seperate(); it is accessed through its "place"
# and "precedent" columns.
RAW_DATA_PATH = "./data/precedents.csv"
# Default output of data_seperate() and default input of data_merge() and
# build_dictonary().
SEP_DATA_PATH = "./data/fixed_precedents.csv"
# Default output of data_merge(): one normalized "text" column per record.
MERGED_DATA_PATH = "./data/documents.csv"
# Default output of build_dictonary(): one normalized word per line.
FREQUENCY_DICTIONARY = "./data/frequency.dic"
# Comma-delimited file read with pandas; its "words" column supplies the
# stop-word set.
STOPWORDS_PATH = "data/stopwords.txt"
# Row positions used by data_seperate() when slicing the raw data.
# NOTE(review): presumably rows between BEG and END are a known-bad block
# that must be excluded -- confirm against the data set.
BAD_BLOCK_BEG = 62000
BAD_BLOCK_END = 82000
def data_seperate(output_path = SEP_DATA_PATH):
    """Split the packed ``place`` column into separate fields and save a CSV.

    Reads ``RAW_DATA_PATH``, removes the rows between ``BAD_BLOCK_BEG`` and
    ``BAD_BLOCK_END``, drops duplicate rows and rows whose ``place`` text
    contains ``"nan"``, then splits ``place`` on its first three commas into
    ``subsidiary``, ``contractor``, ``worktype`` and ``place``.  The lowercased
    ``precedent`` text is stored as ``description``.

    Args:
        output_path: Destination CSV path.  The file is written with the
            original row index and every field quoted.
    """
    raw_data = pd.read_csv(RAW_DATA_PATH, sep=",")

    # Remove literal backslash escape sequences and quote characters, then
    # lowercase.  str() turns missing cells into the text "nan", which the
    # filter below relies on.
    raw_data["place"] = raw_data["place"].map(
        lambda x: str(x)
        .replace('\\n', '')
        .replace('\\r', '')
        .replace('\\t', '')
        .replace('\"', '')
        .replace('\'', '')
        .lower()
    )

    # Keep everything before and after the bad block.  The second slice must
    # start at BAD_BLOCK_END; slicing up to BAD_BLOCK_END (as the code did
    # before) kept the bad rows and discarded all rows after the block.
    rows_before_bad = raw_data[:BAD_BLOCK_BEG]
    rows_after_bad = raw_data[BAD_BLOCK_END:]
    raw_data = pd.concat([rows_before_bad, rows_after_bad])

    raw_data = raw_data.drop_duplicates(keep='first', inplace=False)
    # Drops rows whose place text contains "nan" anywhere (missing cells were
    # stringified to "nan" above).
    raw_data = raw_data[~raw_data.place.str.contains("nan")]

    # At most four parts: any commas after the third stay in the last field.
    new = raw_data["place"].str.split(",", n=3, expand=True)
    # Remove the enclosing parentheses of the packed value.
    new[0] = new[0].str.lstrip('(')
    new[3] = new[3].str.rstrip(')')
    for column in (0, 1, 2, 3):
        new[column] = new[column].str.strip()

    # Handles both literal "\n" escape text and real newline characters.
    new[4] = raw_data["precedent"].map(
        lambda x: str(x)
        .lower()
        .replace('\\n', '')
        .replace('\\r', '')
        .replace('\\t', '')
        .replace('\n', '')
    )

    new = new.rename(columns={
        0: 'subsidiary',
        1: 'contractor',
        2: 'worktype',
        3: 'place',
        4: 'description',
    })
    new.to_csv(output_path, index=True, quoting=csv.QUOTE_ALL)
def data_merge(input_path = SEP_DATA_PATH, output_path = MERGED_DATA_PATH):
    """Merge the separated columns into one normalized text column and save it.

    For every row of ``input_path`` the ``subsidiary``, ``contractor``,
    ``worktype``, ``place`` and ``description`` values are joined with spaces
    (after blanking two placeholder phrases).  Each space-separated token then
    has every character outside the Cyrillic/underscore set removed, empty
    tokens and stop words are discarded, and the remaining tokens are replaced
    by the normal form of their first pymorphy2 parse.

    Args:
        input_path: CSV produced by the separation step.
        output_path: Destination CSV; written with the row index and a single
            ``text`` column, every field quoted.
    """
    stop_words = set(pd.read_csv(STOPWORDS_PATH, delimiter=',')['words'])
    frame = pd.read_csv(input_path, delimiter=',', quotechar='\"')

    # Placeholder phrases carry no information, so they are blanked out.
    contractor = frame['contractor'].str.replace('не привлекался', '')
    worktype = frame['worktype'].str.replace('не определена', '')

    combined = (
        frame['subsidiary'] + ' '
        + contractor + ' '
        + worktype + ' '
        + frame['place'] + ' '
        + ' ' + frame['description']
    ).str.strip()
    # The combined Series is unnamed, so its DataFrame column is labelled 0.
    merged = pd.DataFrame(combined).rename(columns={0: 'text'})

    analyzer = pymorphy2.MorphAnalyzer()
    non_cyrillic = re.compile('[^А-яЁё_]+', re.UNICODE)

    def normalize(raw_text):
        # Strip unwanted characters from each token, drop stop words and
        # empty tokens, then lemmatize what is left.
        cleaned = [non_cyrillic.sub('', piece)
                   for piece in str(raw_text).strip().split(' ')]
        kept = [token for token in cleaned
                if token not in stop_words and len(token.strip()) > 0]
        return ' '.join(analyzer.parse(token)[0].normal_form for token in kept)

    merged['text'] = merged['text'].map(normalize)
    merged.to_csv(output_path, index=True, quoting=csv.QUOTE_ALL)
def build_dictonary(input_path = SEP_DATA_PATH, output_path = FREQUENCY_DICTIONARY):
    """Append every normalized word of the ``description`` column to a file.

    Each description is split on spaces; every token has all characters
    outside the Cyrillic/underscore set removed, empty tokens and stop words
    are dropped, and the rest are replaced by the normal form of their first
    pymorphy2 parse.  The resulting words are written one per line, with
    repetitions preserved.

    The output file is opened in append mode, so running this function again
    adds to an existing file instead of replacing it.  Descriptions that
    normalize to nothing contribute no lines (previously they produced a
    blank line).

    Args:
        input_path: CSV containing a ``description`` column.
        output_path: Text file the words are appended to, encoded as UTF-8.
    """
    stop_words_data = pd.read_csv(STOPWORDS_PATH, delimiter=',')
    stop_words = set(stop_words_data['words'])
    data = pd.read_csv(input_path, delimiter=',')
    morph = pymorphy2.MorphAnalyzer()
    pattern = re.compile('[^А-яЁё_]+', re.UNICODE)

    def normalized_words(text):
        # Yield the lemma of every meaningful token in one description.
        for raw_word in str(text).strip().split(' '):
            word = pattern.sub('', raw_word)
            if word and word not in stop_words:
                yield morph.parse(word)[0].normal_form

    # Open the file once instead of once per word.  UTF-8 is explicit so the
    # Cyrillic output does not depend on the platform's default encoding.
    with open(output_path, 'a', encoding='utf-8') as dictionary_freq:
        for msg in data['description']:
            dictionary_freq.writelines(
                word + "\n" for word in normalized_words(msg)
            )