-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathscanner.py
More file actions
164 lines (133 loc) · 4.95 KB
/
scanner.py
File metadata and controls
164 lines (133 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# coding=utf-8
import os
import re
import math
import pickle
def generate_the_set_of_distinct_keywords_for_one_doc(doc):
    """
    Extract the distinct words of a single document.

    :param doc: path of the document file (read as UTF-8 text)
    :return: list of the document's distinct words
    """
    with open(doc, encoding='UTF-8') as f:
        words = f.read()
    # Raw string literal: in the original non-raw pattern, '\.' and '\?'
    # are invalid string escapes (SyntaxWarning since Python 3.12).  The
    # resulting regex is byte-identical, so behavior is unchanged.
    words = re.split(r' |\t|\n|\.|,|\?|!|:|;', words)  # split text into words
    # NOTE(review): consecutive delimiters (e.g. "a. b") produce empty-string
    # tokens that end up in the result; downstream code appears to tolerate
    # this, so the behavior is deliberately kept.
    distinct_word = list(set(words))
    return distinct_word
# Cache: per-document distinct-word lists, indexed by document id.
DISTINCT_WORD_LST_OF_EACH_DOC = []
# Cache: distinct words across the whole document collection.
DISTINCT_WORD_LST = []
def get_file_count(proj_dir_path=''):
    """
    Return the number of documents in the project.

    Counts the regular files under ``plain_text`` (or, failing that,
    ``cipher_text``) and persists the fresh count as element 3 of the
    pickled ``config`` file.  If neither directory exists, the count
    previously cached in ``config`` is returned instead.

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: the document count, or 0 when nothing can be counted or read
    """
    try:
        target = proj_dir_path + 'plain_text'
        if not os.path.isdir(target):
            target = proj_dir_path + 'cipher_text'
            if not os.path.isdir(target):
                # No document directory at all: fall back to the count
                # cached at index 3 of the config file.
                with open(proj_dir_path + 'config', 'rb') as cfg:
                    return pickle.load(cfg)[3]
        count = sum(
            1 for entry in os.listdir(target)
            if os.path.isfile(os.path.join(target, entry))
        )
        # Refresh the cached count: keep the first three config entries,
        # replace the fourth with the fresh count.
        with open(proj_dir_path + 'config', 'rb') as cfg:
            head = pickle.load(cfg)[:3]
        with open(proj_dir_path + 'config', 'wb') as cfg:
            pickle.dump(head + [count], cfg)
        return count
    except (IndexError, EOFError, FileNotFoundError):
        return 0
def generate_the_set_of_distinct_keywords_for_docs(proj_dir_path=''):
    """
    Build (and cache in module globals) the per-document word lists and
    the global distinct-word list for the whole plaintext collection.

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: (list of per-document distinct-word lists, global distinct-word list)
    """
    global DISTINCT_WORD_LST_OF_EACH_DOC, DISTINCT_WORD_LST
    # Both caches already populated: reuse the previous result.
    if DISTINCT_WORD_LST and DISTINCT_WORD_LST_OF_EACH_DOC:
        return DISTINCT_WORD_LST_OF_EACH_DOC, DISTINCT_WORD_LST
    base_dir = proj_dir_path + 'plain_text'
    for doc_id in range(get_file_count(proj_dir_path)):
        doc_words = generate_the_set_of_distinct_keywords_for_one_doc(
            base_dir + '/' + str(doc_id) + '.txt')
        DISTINCT_WORD_LST_OF_EACH_DOC.append(list(set(doc_words)))
        DISTINCT_WORD_LST.extend(doc_words)
    # Deduplicate the accumulated global word list once at the end.
    DISTINCT_WORD_LST = list(set(DISTINCT_WORD_LST))
    return DISTINCT_WORD_LST_OF_EACH_DOC, DISTINCT_WORD_LST
def generate_Dw_for_each_keyword(proj_dir_path=''):
    """
    Dw: for each keyword w, the identifiers of the documents in D that
    contain w, listed in increasing document-id order.

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: dict mapping keyword -> list of document indices
    """
    Dw = {}  # a dict keyed by keyword, for O(1) retrieval
    distinct_keyword_lst_of_each_doc, distinct_keyword_lst = generate_the_set_of_distinct_keywords_for_docs(proj_dir_path)
    # Convert each per-document word list to a set ONCE, so the inner
    # membership test below is O(1) instead of a linear scan per word
    # (the original was O(#keywords * #docs * doc_size)).
    word_sets = [set(lst) for lst in distinct_keyword_lst_of_each_doc]
    for word in distinct_keyword_lst:
        for doc_index, words_in_doc in enumerate(word_sets):
            if word in words_in_doc:
                # setdefault == "create empty list on first hit", as before.
                Dw.setdefault(word, []).append(doc_index)
    return Dw
# Module-level cache for get_s(); 0 means "not computed yet".
s = 0
def get_s(proj_dir_path=''):
    """
    Return s, the total size of the document collection in "min-units".

    Concretely, s = ceil(log2(total character count)) over every file
    under ``plain_text``; if that directory is missing, the value cached
    at index 2 of the pickled ``config`` file is returned instead.  The
    computed value is written back to ``config`` and cached in the
    module-level global ``s``.

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: s, or 0 when nothing can be computed or read
    """
    try:
        global s
        if s != 0:
            # Cached from a previous call.
            return s
        DIR = proj_dir_path + 'plain_text'
        if not os.path.isdir(DIR):
            # No plaintext directory: fall back to the cached value at
            # index 2 of the config file.
            with open(proj_dir_path + 'config', 'rb') as f:
                return pickle.load(f)[2]
        list_dir = os.walk(DIR)
        for root, dirs, files in list_dir:
            for f in files:
                fname = os.path.join(root, f)
                with open(fname, encoding='UTF-8') as doc:
                    # Accumulate the total character count into the global.
                    s += len(doc.read())
        # print(s)
        if s == 0:
            # Empty collection: nothing to take the log of.
            return 0
        s = int(math.ceil(math.log2(s)))
        # Persist s at index 2 of the config file.
        with open(proj_dir_path + 'config', 'rb') as f:
            tmp = pickle.load(f)
            tmp[2] = s
        with open(proj_dir_path + 'config', 'wb') as f:
            pickle.dump(tmp, f)
        return s
    except FileNotFoundError:
        # NOTE(review): if ``config`` is missing, the global ``s`` may at
        # this point hold the raw character count (not its log), and a
        # later call would return that cached raw value — confirm whether
        # this is intended.
        return 0
def check_filename_format(proj_dir_path=''):
    """
    Verify that the files under plain_text follow the expected naming
    scheme (0.txt, 1.txt, 2.txt, ...).

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: True if every expected file exists, False otherwise
    """
    base_dir = proj_dir_path + 'plain_text'
    expected_paths = (
        os.path.join(base_dir, '%d.txt' % idx)
        for idx in range(get_file_count(proj_dir_path))
    )
    return all(os.path.exists(path) for path in expected_paths)
def reformat_filename(proj_dir_path=''):
    """
    Rename the files under plain_text to the expected naming scheme
    (0.txt, 1.txt, ...).

    update 0317: the original file name is preserved by appending it to
    the end of the file itself.

    :param proj_dir_path: project directory prefix (should end with '/')
    :return: None
    """
    DIR = proj_dir_path + 'plain_text'
    # BUG FIX: get_file_count() was previously called WITHOUT the project
    # path, so it counted (and wrote a config file) relative to the current
    # working directory instead of the project directory — every other
    # function here passes proj_dir_path.  The returned value was unused;
    # the call is kept only for its side effect of refreshing the cached
    # count in the project's config file.
    get_file_count(proj_dir_path)
    for root, dirs, files in os.walk(DIR):
        i = 0
        for file in files:
            file_path = os.path.join(DIR, file)
            # In 'a+' mode every write goes to the end of the file
            # regardless of seek(), so the original name lands on a new
            # final line.
            with open(file_path, 'a+', encoding='utf-8') as f:
                f.seek(0, 0)
                f.write('\n' + file)
            os.rename(file_path, os.path.join(DIR, str(i) + '.txt'))
            i += 1
if __name__ == '__main__':
    # Smoke test against the sample project under test_x/.
    per_doc_words, all_words = generate_the_set_of_distinct_keywords_for_docs("test_x/")
    keyword_to_docs = generate_Dw_for_each_keyword("test_x/")
    print(keyword_to_docs['China'])