-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_cleaner.py
572 lines (420 loc) · 18.7 KB
/
script_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
"""
Clean a script file to conversations.
Note that the first 20-30 lines of the output may not be reliable,
as the cleaner learns from those lines where the character dialog really begins
and calculates the whitespace indentation from them.
--inputdir to specify directory for txt files, else use working directory
--file to specify single file only
--debug for debug output
Base file formats:
Format 1 : Standard movie scripts
TITLE
Some long description etc...
CHARNAME
(some intent)
talks some lines etc
OTHERCHARNAME
responds to chat
Some other things that are not dialog.
Format 2 : Non-indented scripts
CHARNAME : says something
OTHERCHAR: response
Some other things that are not dialog\n
"""
import statistics
import os
import pprint
from argparse import ArgumentParser
from argparse import RawDescriptionHelpFormatter
import codecs
import logging
from pprint import pformat
# Module-level logger; it is named after this file's path (__file__).
logger = logging.getLogger(__file__)
# Default threshold is INFO, so logger.debug() messages are dropped unless
# the level is lowered later.
logger.setLevel(level=logging.INFO)
def convert_file_to_script(filename, test_name_change=False):
    """Convert a non-standard ``NAME: text`` script to the standard layout.

    Non-standard scripts look like::

        NAMECHAR: something said
        OTHERCHAR: responding something else

    A line is treated as a speaker line when a ``:`` occurs within its first
    15 characters, but not at position 0.  On conversion such a line becomes
    an upper-cased name line indented by 10 spaces, followed by the spoken
    text indented by 5 spaces; all other lines are copied unchanged.

    The converted file is written as ``PRO_<basename>`` into a ``tmp``
    folder next to the input file (the folder is created when missing).

    Args:
        filename: path of the script to read.  It is decoded as UTF-8 and
            undecodable bytes are ignored.
        test_name_change: when True nothing is written; the first 200 lines
            are only probed to decide whether the file needs conversion.

    Returns:
        The path of the ``PRO_`` file when the file was converted, or (in
        probe mode) when more than 10 speaker lines were found, i.e. the
        file should be converted.  In probe mode ``filename`` itself is
        returned when the file does not need conversion.
    """
    basefile = os.path.basename(filename)
    outfilename = "PRO_" + basefile
    # dirname() is "" for a bare file name; falling back to "." keeps the
    # "./tmp/PRO_<name>" result for that case while building a valid
    # "<dir>/tmp" path for files that live in another directory.
    outpath = os.path.join(os.path.dirname(filename) or ".", "tmp")
    outfile = os.path.join(outpath, outfilename)
    threshold = 10      # speaker lines needed before a conversion is advised
    probe_lines = 200   # how many leading lines the probe looks at

    os.makedirs(outpath, exist_ok=True)

    with open(filename, encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    if test_name_change:
        # Probe only: count speaker lines, never touch the output file.
        speaker_lines = 0
        for line in lines[:probe_lines]:
            if line[:15].find(":") > 0:
                speaker_lines += 1
                if speaker_lines > threshold:
                    logging.debug("Script needs conversion : " + outfilename)
                    return outfile
        logging.debug("Script ok no conversion: " + filename)
        return filename

    with open(outfile, "w") as fw:
        for line in lines:
            col_in_line = line[:15].find(":")
            if col_in_line > 0:
                # Name on its own indented, upper-cased line ...
                fw.write(" " * 10 + line[:col_in_line].upper() + "\n")
                # ... followed by the text (the extra newline leaves a blank
                # line after every speech).
                line = " " * 5 + line[col_in_line + 1:] + "\n"
            fw.write(line)
    return outfile
def get_left_whitepace_length(text):
    """Return the size of the leading run of spaces or tabs in *text*.

    Leading spaces and leading tabs are measured separately and added up,
    so only the kind of character the line actually starts with contributes
    (a space followed by a tab counts as 1).
    """
    return sum(len(text) - len(text.lstrip(pad)) for pad in (' ', '\t'))
def get_right_whitepace_length(text):
    """Return how many whitespace characters (newline included) end *text*."""
    trimmed = text.rstrip()
    return len(text[len(trimmed):])
def is_line_character_name(text,min_whitespace_or_tab=4):
    """Check whether *text* looks like a character name line of a movie script.

    A name is usually an indented, all-capital line on its own::

              SOMENAME
        Some line talked by name

    Args:
        text: the raw line, including its leading and trailing whitespace.
        min_whitespace_or_tab: minimum left whitespace length (as measured by
            get_left_whitepace_length) required for a name line.

    Returns:
        True when every condition below holds, False otherwise:
        the line is indented enough, has some trailing whitespace (e.g. its
        newline), is not blank, does not start with a digit, is at most 15
        characters long once stripped, and is all upper case.
    """
    # TODO: try a regexp here for tab or space
    if get_left_whitepace_length(text) < min_whitespace_or_tab:
        return False
    if get_right_whitepace_length(text) == 0:
        return False
    candidate = text.strip()
    if not candidate:
        # Whitespace-only line: there is no first character to inspect and
        # it cannot be a name.
        return False
    if candidate[0].isdigit():
        # do not accept a name starting with a digit
        return False
    if len(candidate) > 15:
        # do not accept anything longer than 15 characters as a name
        return False
    # NOTE: a further `len(text) < 15` check on the raw line was tried and
    # dropped; the original note says it "messes 2001" (presumably a script).
    return text.isupper()
def is_next_chapter_or_section(text, second_pass=False):
    """Tell whether *text* marks a chapter/section break.

    Such a line is neither a name, nor blank, nor dialog.  With
    ``second_pass`` set, bracketed lines also count, and parenthesised lines
    count when ``second_pass`` is exactly ``True``.
    """
    body = text.strip()
    # Blank lines and lines starting with "=" (math output) never qualify.
    if not body or body.startswith("="):
        return False

    first, last = body[0], body[-1]
    is_short = len(body) < 4
    markers = (
        # upper-case line that both starts and ends with a digit
        first.isdigit() and last.isdigit() and body.isupper(),
        # line made of nothing but asterisks
        body.strip('*') == "",
        # three or more dashes on a line that does not start with a letter
        not first.isalpha() and body.count('-') >= 3,
        # very short line starting or ending with a digit (page numbers etc.)
        is_short and (first.isdigit() or last.isdigit()),
        # second pass: bracketed text
        bool(second_pass) and first == "[",
        # second pass: parenthesised remark about the character, ignored
        second_pass is True and first == "(" and last == ")",
        # case-sensitive on purpose: only the upper-case marker counts
        body.startswith("INSIDE "),
        'CUT TO' in body.upper(),
    )
    return any(markers)
def should_bypass_line(text):
    """Tell whether a line must be skipped and ignored.

    A line is bypassed when it contains the upper-case key "CONTINUE"
    (which also covers "CONTINUED") or when it consists of whitespace only.
    """
    return "CONTINUE" in text or text.isspace()
def get_line_length_after_name(text):
    """Return the length of the text between the first and second ":".

    The part before the first ":" is taken to be the name.  Without any ":"
    the result is 0.  This is used for the non-standard conversion script.
    """
    try:
        return len(text.split(":")[1])
    except IndexError:
        # no ":" at all, so there is nothing after a name
        return 0
def get_clean_script(filename,debug=False, line_count=None, left_whitespace_start=2, name_trigger=30):
    """ Generate a cleaned script as a list of "NAME : dialog" lines.

    Note that this is not really a standard format, just a format that the
    persona file parser can understand.

    The lines of the file are scanned one by one.  A line accepted by
    is_line_character_name() starts a new entry "\\n\\t<NAME> :"; following
    lines that are indented at least as much as the current threshold are
    appended to that entry.  Entries are stored in the dialog list when the
    next name, a non-dialog line or a section is met, and rows of "*" are
    added as section separators.

    Args:
        filename: path of the script; opened with the platform default encoding.
        debug: when true, a per-line trace is emitted (partly through
            logger.debug, partly through print).
        line_count: when not None only the first line_count lines are used.
        left_whitespace_start: initial minimum left whitespace for name and
            dialog lines.
        name_trigger: number of names that must have been seen before the
            whitespace threshold is re-estimated (once) from the collected
            samples.

    Returns:
        dict with the keys "names" (set of names), "dialog" (list of str),
        "name_count" and "num_script_lines".  When reading the lines fails,
        "names" and "dialog" are None and both counters are 0.
    """
    #Start leftspace count
    #will update once 10 character names found
    #usually it is 4 or 6 but found instances where 2 tabs is used
    max_left_whitespace = left_whitespace_start
    # Result returned as-is when the file content cannot be read.
    out_dict = {}
    out_dict["names"] = None
    out_dict["dialog"] = None
    out_dict["name_count"] = 0
    out_dict["num_script_lines"] = 0
    # NOTE(review): open() sits outside the try below, so a missing file
    # raises here instead of producing the "Cannot open" warning - confirm.
    with open(filename) as f:
        script_lines = 0
        name_list = []
        try:
            if line_count is None:
                lines = f.readlines()
            else:
                lines = f.readlines()[0:line_count]
        except Exception as e:
            # Any failure while reading (e.g. a decoding error) ends here.
            logging.warning(f"Cannot open {filename}")
            #raise(e)
            #print(e)
            return out_dict
        # True while the previous line was a name or part of a speech.
        prev_line_is_char_or_dialog = False
        # Finished entries and section separators, in order.
        dialog = []
        # Entry currently being collected ("\n\tNAME : text ...").
        char_line = ""
        # Left whitespace of the first line after each name; starts with a 0.
        char_line_calculated_left_space = [0]
        NAME_TRIGGER = name_trigger
        num_names = 0
        # Set by a name line: sample the whitespace of the line that follows.
        get_left_whitespace_count_after_this_line = False
        print_this_line = False
        if debug : print_this_line = True
        # Non-blank lines seen since a character last spoke.
        char_not_talking=0
        for line in lines:
            # Tags describing what happened to this line (debug trace only).
            debug_header = ''
            if prev_line_is_char_or_dialog:
                debug_header += 'PrevIsLineoRChar,'
                #print_this_line = True
            if should_bypass_line(line):
                #print(f"pass: {line}")
                # Blank / CONTINUE lines end the running speech and are skipped.
                debug_header += 'Bypassing,'
                prev_line_is_char_or_dialog = False
                #if print_this_line: print(debug_header + line)
                continue
            ##line stripping is not done above as name check etc depend on it
            if (char_not_talking > 5) or is_next_chapter_or_section(line):
                ##this is a section make sure we append char talk
                if(get_line_length_after_name(char_line) > 1 ):
                    #check if it is actually line, if it is empty or single char/whitespace bypass it
                    #script_lines += 1
                    dialog.append(char_line)
                    debug_header += 'appendline'
                    if debug: print(char_line)
                # NOTE(review): this reset is unconditional; while
                # char_not_talking is still > 5 this branch also runs for the
                # line right after a name and then drops the pending
                # "NAME :" prefix - confirm this is intended.
                char_line = ""
                ##10 line no script assume section
                ##Need for dialogue separation.. Just add a line in dialog
                dialog.append("*"*15)
                debug_header += 'nextSection,'
                if print_this_line: logger.debug(debug_header+ " | " +line)
                if debug: logger.debug("*"*10)
                char_not_talking += 1
                ##should continue to next line
                # (there is no `continue`: the line is still examined below)
            if is_line_character_name(line, min_whitespace_or_tab=max_left_whitespace):
                prev_line_is_char_or_dialog = True
                name = line.strip()
                num_names += 1
                name_list.append(name)
                get_left_whitespace_count_after_this_line = True
                ##append previous char_line
                if(get_line_length_after_name(char_line) > 1 ):
                    #check if it is actually line, if it is empty or single char/whitespace bypass it
                    #script_lines += 1
                    dialog.append(char_line)
                    debug_header += 'appendline'
                    if debug: logger.debug(char_line)
                #start new char_line
                char_line = "\n\t" + name + " :"
                debug_header += 'NameChar,'
            else:
                if prev_line_is_char_or_dialog:
                    space_length = get_left_whitepace_length(line)
                    ##need auto left linespace calculation filling the array as tagged
                    if get_left_whitespace_count_after_this_line:
                        char_line_calculated_left_space.append(space_length)
                        get_left_whitespace_count_after_this_line = False
                        debug_header += 'getwhitespace,'
                    if space_length >= max_left_whitespace:
                        # Indented enough: the line belongs to the current speech.
                        char_line += " " + line.strip()
                        #print("Line of char:" + char_line , end= " ")
                        prev_line_is_char_or_dialog = True
                        char_not_talking = 0
                        debug_header += 'charistalking,'
                        if len(char_line_calculated_left_space)>20 and num_names>=NAME_TRIGGER:
                            # Re-estimate the threshold from the samples; the
                            # huge trigger value keeps this from happening again.
                            max_left_whitespace = statistics.median(char_line_calculated_left_space)
                            NAME_TRIGGER = 999999
                            debug_header += 'resetleft,'
                        script_lines += 1
                        if print_this_line: logger.debug(debug_header + " | " + line)
                        #continue
                    else:
                        #end of character or script
                        char_not_talking +=1
                        debug_header += 'charNOTtalking,'
                        prev_line_is_char_or_dialog = False
                else:
                    # Neither a name nor a continuation: store what was collected.
                    char_not_talking += 1
                    debug_header += 'notprev,'
                    if(get_line_length_after_name(char_line) > 1 ):
                        #check if it is actually line, if it is empty or single char/whitespace bypass it
                        #script_lines += 1
                        dialog.append(char_line)
                        debug_header += 'appendline'
                        if debug: logger.debug(char_line)
                    char_line = ""
                    if is_next_chapter_or_section(line,second_pass=True) and char_not_talking>5:
                        dialog.append("*"*25)
                        debug_header += 'nextSection,'
                        if print_this_line: print(debug_header+ " | " +line)
                        if debug: logger.debug("*"*10)
                        continue
                debug_header +='endelse,'
                if print_this_line: logger.debug(debug_header + " | " +line)
            debug_header +='endLine,'
            if print_this_line: logger.debug(debug_header + " | " +line)
        # NOTE(review): char_line is not stored after the loop, so a speech
        # still being collected when the input ends never reaches dialog -
        # confirm this is intended.
        out_dict["names"] = set(name_list)
        out_dict["dialog"] = dialog
        out_dict["name_count"] = num_names
        out_dict["num_script_lines"] = script_lines
        return out_dict
def get_filename_to_process(filename):
    """Return the file that should be cleaned for *filename*.

    The file is first probed; when the probe answers with a different name
    the file is converted and the converted file's name is returned,
    otherwise *filename* is returned unchanged.
    """
    probe_result = convert_file_to_script(filename, test_name_change=True)
    if probe_result == filename:
        # no conversion needed
        return filename
    logger.info("Converting {}".format(filename))
    return convert_file_to_script(filename)
def write_output(out, path,filename ):
    """Write the cleaned dialog to a text file.

    Args:
        out: result dict of get_clean_script(); only "dialog" is used.  A
            missing or None "dialog" (read failure) is written as an empty
            file.
        path: output directory; it is created when it does not exist.
        filename: only its base name is used as the output file name.

    Returns:
        The path of the written file.
    """
    if path:
        os.makedirs(path, exist_ok=True)
    out_file = os.path.join(path, os.path.basename(filename))
    # "dialog" stays None when the script could not be read.
    dialog = out.get("dialog") or []
    section_mark = '***********'
    prev_is_section = False
    with open(out_file, 'w') as f:
        for line in dialog:
            is_section = section_mark in line
            if is_section and prev_is_section:
                # collapse runs of consecutive section separators into one
                continue
            prev_is_section = is_section
            f.write(f"{line}\n")
    return out_file
def __main():
    """Command-line entry point.

    With ``--file`` a single file is converted when needed, cleaned and
    written into ``./tmp/``; the function then returns True.

    Otherwise every ``*.txt`` file of ``--inputdir`` (current directory by
    default) is processed.  Files with more than 5 names, more than 100
    script lines and more than 10 dialog entries are cleaned again and
    written to ``<inputdir>/output``.

    ``--debug`` lowers the log level to DEBUG and enables the per-line trace
    of get_clean_script() for the files that are written.
    """
    parser = ArgumentParser(description=__doc__,
                            formatter_class=RawDescriptionHelpFormatter,
                            )
    parser.add_argument("--inputdir", type=str, default="./", help="Path or url of the text scripts. Current directory by default")
    parser.add_argument("--file", type=str, default="", help="File process mode conversion only")
    parser.add_argument("--debug", action='store_true', help="Debug output")
    args = parser.parse_args()

    # Without lowering both levels the debug messages of this module are
    # filtered out even when --debug is given.
    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=level)
    logger.setLevel(level)
    logger.info("Arguments: %s", pformat(args))

    if args.file != "":
        path = "./tmp/"
        # The output folder is not guaranteed to exist yet.
        os.makedirs(path, exist_ok=True)
        logger.info("Get file to process..")
        filename = get_filename_to_process(args.file)
        logger.info(filename)
        logger.info("Get clean script..")
        ##this will get left starting ones, but may miss some other ones..
        out = get_clean_script(filename, debug=args.debug, name_trigger=25, left_whitespace_start=0)
        logger.info(path + filename)
        logger.info("Write output..")
        outfile = write_output(out, path, filename)
        logger.info("DONE , written to: %s", outfile)
        return True

    goodfiles3 = []
    for entry in os.listdir(args.inputdir):
        if not entry.endswith(".txt"):
            continue
        # listdir() yields bare names; anchor them to the input directory.
        # normpath keeps a plain "name.txt" for the default "./" directory.
        source = os.path.normpath(os.path.join(args.inputdir, entry))
        ##do conversion if necessary and get filename of that
        filename = get_filename_to_process(source)
        ##this will get left starting ones, but may miss some other ones..
        out = get_clean_script(filename, debug=False, name_trigger=25, left_whitespace_start=0)
        name_count = out.get("name_count", 0)
        num_script_line = out.get("num_script_lines", 0)
        logger.debug(f"Filename:{filename}, counted:{name_count} names, has:{num_script_line} script lines")
        if name_count > 5 and num_script_line > 100 and len(out["dialog"]) > 10:
            goodfiles3.append([filename, name_count, num_script_line, "35_0"])

    pp = pprint.PrettyPrinter(indent=1)
    logger.info("File List:\n %s", pp.pformat(goodfiles3))

    #make a folder for output
    path = os.path.join(args.inputdir, 'output')
    os.makedirs(path, exist_ok=True)

    logger.info("Writing files...")
    for filename in [row[0] for row in goodfiles3]:
        # cleaned a second time so that the --debug trace covers these files
        out = get_clean_script(filename, debug=args.debug, name_trigger=25, left_whitespace_start=0)
        outfile = write_output(out, path, filename)
        logger.debug("Written to: %s", outfile)
    logger.info("Files written to: %s", path)
    logger.info("DONE")
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    __main()