-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_word.py
50 lines (38 loc) · 1.62 KB
/
split_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python3
import os
import random
# Lines longer than this (after spaces are removed) are randomly split.
MAX_LINE_NUM = 10
# Directory whose files are processed, and where the rewritten copies go.
SOURCE_DIR = 'data/newsgroup'
TARGET_DIR = os.path.join(SOURCE_DIR, 'new')


def random_line(line, min_len=4, max_len=8):
    """Split *line* into randomly sized, newline-terminated pieces.

    A chunk is emitted unchanged (plus a trailing newline) once its length
    does not exceed a threshold drawn uniformly from [min_len, max_len];
    otherwise it is cut at a random position and both halves are processed
    the same way, left half first.  Concatenating the pieces without their
    newlines always gives back *line*.

    Args:
        line: Text to split; expected to contain no line break.
        min_len: Smallest threshold and smallest cut position (default 4).
        max_len: Largest threshold, i.e. the longest piece that can be
            emitted (default 8).

    Returns:
        A list of strings, each ending in a newline.  Empty input gives an
        empty list.

    Raises:
        ValueError: If ``min_len`` is below 1 or ``max_len`` is below
            ``min_len``.
    """
    if min_len < 1 or max_len < min_len:
        raise ValueError('need 1 <= min_len <= max_len')
    if not line:
        return []

    pieces = []
    # Explicit stack instead of recursion: the top of the stack is always the
    # left-most chunk still to be handled, so pieces come out in text order
    # and the random draws happen in the same order as a left-first recursion.
    pending = [line]
    while pending:
        chunk = pending.pop()
        threshold = random.randint(min_len, max_len)
        if len(chunk) <= threshold:
            pieces.append(chunk + '\n')
            continue
        # len(chunk) > threshold >= min_len, so the range below is never
        # empty and both halves are non-empty.
        cut = random.randint(min_len, len(chunk) - 1)
        pending.append(chunk[cut:])
        pending.append(chunk[:cut])
    return pieces


def split_file(src_path, dst_path):
    """Copy one text file, removing spaces and splitting long lines.

    Every input line is stripped and has its spaces removed.  Lines longer
    than ``MAX_LINE_NUM`` are replaced by the pieces from ``random_line``;
    shorter ones (including empty ones) are written back as a single line.

    Args:
        src_path: Path of the UTF-8 text file to read.
        dst_path: Path of the UTF-8 text file to write (overwritten).
    """
    out_lines = []
    with open(src_path, 'r', encoding='UTF-8') as src:
        for raw in src:
            text = raw.strip().replace(' ', '')
            if len(text) > MAX_LINE_NUM:
                out_lines.extend(random_line(text))
            else:
                out_lines.append(text + '\n')
    with open(dst_path, 'w', encoding='UTF-8') as dst:
        dst.writelines(out_lines)


def main(source_dir=SOURCE_DIR, target_dir=None):
    """Process every ``name.ext`` file in *source_dir* into *target_dir*.

    Args:
        source_dir: Directory to scan (not recursive).
        target_dir: Directory for the rewritten files; defaults to the
            ``new`` sub-directory of *source_dir*.  Created if missing.
    """
    if target_dir is None:
        target_dir = os.path.join(source_dir, 'new')

    file_list = os.listdir(source_dir)
    random.shuffle(file_list)
    os.makedirs(target_dir, exist_ok=True)

    for change_file in file_list:
        src_path = os.path.join(source_dir, change_file)
        # Only names of the form "<stem>.<ext>" are handled; directories that
        # happen to match that pattern are skipped rather than opened.
        if len(change_file.split('.')) != 2 or not os.path.isfile(src_path):
            continue
        print('change_file', change_file)
        split_file(src_path, os.path.join(target_dir, change_file))
    print('all finished')


if __name__ == '__main__':
    main()