forked from TomLent1/GHScrunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ghscrunch.py
executable file
·788 lines (766 loc) · 39.6 KB
/
ghscrunch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
#!/usr/local/bin/python3
# ghscrunch.py
# Extract GHS chemical hazard classification information out of various
# international government documents. By Akos Kokai.
# Uses the xlrd module (http://www.python-excel.org/).
import xlrd
import csv
import argparse
def ghs_hazard(ref):
# Look up the hazard class based on GHS chapter reference.
# Accurate to GHS Revision 4.
ghs_chapters = {
'2.1': 'Explosives',
'2.2': 'Flammable gases',
'2.3': 'Aerosols',
'2.4': 'Oxidizing gases',
'2.5': 'Gases under pressure',
'2.6': 'Flammable liquids',
'2.7': 'Flammable solids',
'2.8': 'Self-reactive substances and mixtures',
'2.9': 'Pyrophoric liquids',
'2.10': 'Pyrophoric solids',
'2.11': 'Self-heating substances and mixtures',
'2.12': 'Substances and mixtures which, in contact with water, emit flammable gases',
'2.13': 'Oxidizing liquids',
'2.14': 'Oxidizing solids',
'2.15': 'Organic peroxides',
'2.16': 'Corrosive to metals',
'3.1': 'Acute toxicity',
'3.2': 'Skin corrosion/irritation',
'3.3': 'Serious eye damage/irritation',
'3.4': 'Respiratory or skin sensitization',
'3.5': 'Germ cell mutagenicity',
'3.6': 'Carcinogenicity',
'3.7': 'Reproductive toxicity',
'3.8': 'Specific target organ toxicity - Single exposure',
'3.9': 'Specific target organ toxicity - Repeated exposure',
'3.10': 'Aspiration hazard',
'4.1': 'Hazardous to the aquatic environment',
'4.2': 'Hazardous to the ozone layer'
}
return ghs_chapters[ref]
def h_statement(h):
# H-statements: List from GHS Revision 4.
# Did not include the abbreviated combinations (e.g. H302 + H332).
h_statements = {
'H200': 'Unstable explosive',
'H201': 'Explosive; mass explosion hazard',
'H202': 'Explosive; severe projection hazard',
'H203': 'Explosive; fire, blast or projection hazard',
'H204': 'Fire or projection hazard',
'H205': 'May mass explode in fire',
'H220': 'Extremely flammable gas',
'H221': 'Flammable gas',
'H222': 'Extremely flammable aerosol',
'H223': 'Flammable aerosol',
'H224': 'Extremely flammable liquid and vapour',
'H225': 'Highly flammable liquid and vapour',
'H226': 'Flammable liquid and vapour',
'H227': 'Combustible liquid',
'H228': 'Flammable solid',
'H229': 'Pressurized container: may burst if heated',
'H230': 'May react explosively even in the absence of air',
'H231': 'May react explosively even in the absence of air at elevated pressure and/or temperature',
'H240': 'Heating may cause an explosion',
'H241': 'Heating may cause a fire or explosion',
'H242': 'Heating may cause a fire',
'H250': 'Catches fire spontaneously if exposed to air',
'H251': 'Self-heating; may catch fire',
'H252': 'Self-heating in large quantities; may catch fire',
'H260': 'In contact with water releases flammable gases which may ignite spontaneously',
'H261': 'In contact with water releases flammable gas',
'H270': 'May cause or intensify fire; oxidizer',
'H271': 'May cause fire or explosion; strong oxidizer',
'H272': 'May intensify fire; oxidizer',
'H280': 'Contains gas under pressure; may explode if heated',
'H281': 'Contains refrigerated gas; may cause cryogenic burns or injury',
'H290': 'May be corrosive to metals',
'H300': 'Fatal if swallowed',
'H301': 'Toxic if swallowed',
'H302': 'Harmful if swallowed',
'H303': 'May be harmful if swallowed',
'H304': 'May be fatal if swallowed and enters airways',
'H305': 'May be harmful if swallowed and enters airways',
'H310': 'Fatal in contact with skin',
'H311': 'Toxic in contact with skin',
'H312': 'Harmful in contact with skin',
'H313': 'May be harmful in contact with skin',
'H314': 'Causes severe skin burns and eye damage',
'H315': 'Causes skin irritation',
'H316': 'Causes mild skin irritation',
'H317': 'May cause an allergic skin reaction',
'H318': 'Causes serious eye damage',
'H319': 'Causes serious eye irritation',
'H320': 'Causes eye irritation',
'H330': 'Fatal if inhaled',
'H331': 'Toxic if inhaled',
'H332': 'Harmful if inhaled',
'H333': 'May be harmful if inhaled',
'H334': 'May cause allergy or asthma symptoms or breathing difficulties if inhaled',
'H335': 'May cause respiratory irritation',
'H336': 'May cause drowsiness or dizziness',
'H340': 'May cause genetic defects',
'H341': 'Suspected of causing genetic defects',
'H350': 'May cause cancer',
'H351': 'Suspected of causing cancer',
'H360': 'May damage fertility or the unborn child',
'H361': 'Suspected of damaging fertility or the unborn child',
'H362': 'May cause harm to breast-fed children',
'H370': 'Causes damage to organs',
'H371': 'May cause damage to organs',
'H372': 'Causes damage to organs through prolonged or repeated exposure',
'H373': 'May cause damage to organs through prolonged or repeated exposure',
'H400': 'Very toxic to aquatic life',
'H401': 'Toxic to aquatic life',
'H402': 'Harmful to aquatic life',
'H410': 'Very toxic to aquatic life with long lasting effects',
'H411': 'Toxic to aquatic life with long lasting effects',
'H412': 'Harmful to aquatic life with long lasting effects',
'H413': 'May cause long lasting harmful effects to aquatic life',
'H420': 'Harms public health and the environment by destroying ozone in the upper atmosphere'
}
return h_statements[h]
def splitsens(info):
# For Japan GHS classifications.
# Splits apart info for respiratory sensitization and skin sensitization
# in a given row of cells. The hazard class name field, which says
# "Respiratory/skin sensitizer", should not be included in the list passed
# to this function. Returned list includes an automatically assigned field.
resp_list = ['Respiratory sensitizer']
skin_list = ['Skin sensitizer']
for s in info:
s = str(s)
if 'Skin' in s:
a = s.find('Skin')
resp_str = s[:a].rstrip(';([ \n\r')
resp_str = resp_str.replace('Respiratory sensitizer: ', '', 1)
resp_str = resp_str.replace('Respiratory Sensitizer: ', '', 1)
resp_str = resp_str.replace('Respiratory sensitization: ', '', 1)
resp_str = resp_str.replace('(Respiratory sensitization)', '', 1)
skin_str = s[a:].rstrip('; \n\r')
skin_str = skin_str.replace('Skin sensitizer: ', '', 1)
skin_str = skin_str.replace('Skin Sensitizer: ', '', 1)
skin_str = skin_str.replace('Skin sensitization: ', '', 1)
skin_str = skin_str.replace('Skin sensitization)', '', 1)
else:
resp_str = skin_str = s.rstrip(' \n')
resp_list = resp_list + [resp_str]
skin_list = skin_list + [skin_str]
return resp_list, skin_list
def update(chemical, hazard_class, datalist):
# For Japan GHS classifications.
# Copies spreadsheet data to the chemical classification record.
# Does not overwrite the original classification info with blank
# sections of the revised classification.
if hazard_class not in chemical:
chemical[hazard_class] = datalist
elif datalist[1] != '':
chemical[hazard_class] = datalist
def update_all(chemicals, source_file):
# For Japan GHS classifications.
# Creates or updates the dict of chemical classifications from a given
# spreadsheet. Specifying date allows revisions to be clearly seen, but
# not going to deal with parsing the dates given in the spreadsheets.
chembook = xlrd.open_workbook(source_file)
# Ignore the first sheet (it's just a list of chemicals in the workbook).
for chempage in range(1, chembook.nsheets):
chemsheet = chembook.sheet_by_index(chempage)
# Cells are identified by (row, col) where A1 is (0, 0).
id = chemsheet.cell_value(1,0).strip()
chemname = chemsheet.cell_value(1, 3).strip()
casrn_field = chemsheet.cell_value(2, 2).strip('- ')
# Must use something as unique ID if CASRN is blank. Provided index
# IDs are not the same across datasets (2007 - 2009). So, add a few
# characters of the chemical name (without introducing commas).
if casrn_field == '':
casrn_field = id + (chemname[:4] + chemname[-4:]).replace(',', '')
date = chemsheet.cell_value(2, 4)
# But I also want one CASRN per chemical listing.
for c in casrn_field.split(','):
casrn = c.strip()
if casrn not in chemicals:
chemicals[casrn] = dict(name=chemname)
# We are going to extract columns 2-7 for each of the rows.
# col 2: Hazard class name
# col 3: Classification
# col 4: Symbol
# col 5: Signal word
# col 6: Hazard statement
# col 7: Rationale for classification
update(chemicals[casrn], 'explosive',
chemsheet.row_values(5)[2:8] + [date])
update(chemicals[casrn], 'flamm_gas',
chemsheet.row_values(6)[2:8] + [date])
update(chemicals[casrn], 'flamm_aer',
chemsheet.row_values(7)[2:8] + [date])
update(chemicals[casrn], 'oxid_gas',
chemsheet.row_values(8)[2:8] + [date])
update(chemicals[casrn], 'gas_press',
chemsheet.row_values(9)[2:8] + [date])
update(chemicals[casrn], 'flamm_liq',
chemsheet.row_values(10)[2:8] + [date])
update(chemicals[casrn], 'flamm_sol',
chemsheet.row_values(11)[2:8] + [date])
update(chemicals[casrn], 'self_react',
chemsheet.row_values(12)[2:8] + [date])
update(chemicals[casrn], 'pyro_liq',
chemsheet.row_values(13)[2:8] + [date])
update(chemicals[casrn], 'pyro_sol',
chemsheet.row_values(14)[2:8] + [date])
update(chemicals[casrn], 'self_heat',
chemsheet.row_values(15)[2:8] + [date])
update(chemicals[casrn], 'water_fire',
chemsheet.row_values(16)[2:8] + [date])
update(chemicals[casrn], 'oxid_liq',
chemsheet.row_values(17)[2:8] + [date])
update(chemicals[casrn], 'oxid_sol',
chemsheet.row_values(18)[2:8] + [date])
update(chemicals[casrn], 'org_perox',
chemsheet.row_values(19)[2:8] + [date])
update(chemicals[casrn], 'cor_metal',
chemsheet.row_values(20)[2:8] + [date])
update(chemicals[casrn], 'acute_oral',
chemsheet.row_values(24)[2:8] + [date])
update(chemicals[casrn], 'acute_derm',
chemsheet.row_values(25)[2:8] + [date])
update(chemicals[casrn], 'acute_gas',
chemsheet.row_values(26)[2:8] + [date])
update(chemicals[casrn], 'acute_vap',
chemsheet.row_values(27)[2:8] + [date])
update(chemicals[casrn], 'acute_air',
chemsheet.row_values(28)[2:8] + [date])
update(chemicals[casrn], 'skin_cor',
chemsheet.row_values(29)[2:8] + [date])
update(chemicals[casrn], 'eye_dmg',
chemsheet.row_values(30)[2:8] + [date])
# For respiratory & skin sensitization, we need to split strings.
# Don't include cell 2, it's automatically added by splitsens().
resp_only, skin_only = splitsens(chemsheet.row_values(31)[3:8])
update(chemicals[casrn], 'resp_sens',
resp_only + [date])
update(chemicals[casrn], 'skin_sens',
skin_only + [date])
update(chemicals[casrn], 'mutagen',
chemsheet.row_values(32)[2:8] + [date])
update(chemicals[casrn], 'cancer',
chemsheet.row_values(33)[2:8] + [date])
update(chemicals[casrn], 'repr_tox',
chemsheet.row_values(34)[2:8] + [date])
update(chemicals[casrn], 'sys_single',
chemsheet.row_values(35)[2:8] + [date])
update(chemicals[casrn], 'sys_rept',
chemsheet.row_values(36)[2:8] + [date])
update(chemicals[casrn], 'asp_haz',
chemsheet.row_values(37)[2:8] + [date])
update(chemicals[casrn], 'aq_acute',
chemsheet.row_values(41)[2:8] + [date])
update(chemicals[casrn], 'aq_chronic',
chemsheet.row_values(42)[2:8] + [date])
def crunch_jp():
# Process the Japan GHS classifications (2006-2008).
GHS_jp_2006_files = [
'GHS-jp/classification_result_e(ID001-100).xls',
'GHS-jp/classification_result_e(ID101-200).xls',
'GHS-jp/classification_result_e(ID201-300).xls',
'GHS-jp/classification_result_e(ID301-400).xls',
'GHS-jp/classification_result_e(ID401-500).xls',
'GHS-jp/classification_result_e(ID501-600).xls',
'GHS-jp/classification_result_e(ID601-700).xls',
'GHS-jp/classification_result_e(ID701-800).xls',
'GHS-jp/classification_result_e(ID801-900).xls',
'GHS-jp/classification_result_e(ID901-1000).xls',
'GHS-jp/classification_result_e(ID1001-1100).xls',
'GHS-jp/classification_result_e(ID1101-1200).xls',
'GHS-jp/classification_result_e(ID1201-1300).xls',
'GHS-jp/classification_result_e(ID1301-1400).xls',
'GHS-jp/classification_result_e(ID1401-1424).xls'
]
GHS_jp_2007_files = [
'GHS-jp/METI_H19_GHS_review_e.xls',
'GHS-jp/METI_H19_GHS_new_e.xls'
]
GHS_jp_2008_files = [
'GHS-jp/METI_H20_GHS_review_e.xls',
'GHS-jp/METI_H20_GHS_new_e.xls'
]
# Initialize a dictionary of CASRN-identified chemicals.
# Each key will be a CASRN, and each corresponding value will itself be
# a dictionary with:
# - A key called 'name', with the substance name as its value.
# - Keys for each hazard class, with lists of relevant classification
# information as their values.
chemicals = dict()
# These are all the hazard class keywords that we will use.
hazard_classes = [
'explosive',
'flamm_gas',
'flamm_aer',
'oxid_gas',
'gas_press',
'flamm_liq',
'flamm_sol',
'self_react',
'pyro_liq',
'pyro_sol',
'self_heat',
'water_fire',
'oxid_liq',
'oxid_sol',
'org_perox',
'cor_metal',
'acute_oral',
'acute_derm',
'acute_gas',
'acute_vap',
'acute_air',
'skin_cor',
'eye_dmg',
'resp_sens',
'skin_sens',
'mutagen',
'cancer',
'repr_tox',
'sys_single',
'sys_rept',
'asp_haz',
'aq_acute',
'aq_chronic'
]
# First feed in the 2006 mass classification.
for filename in GHS_jp_2006_files:
update_all(chemicals, filename)
# Then add subsequent revisions and additions.
for filename in GHS_jp_2007_files:
update_all(chemicals, filename)
for filename in GHS_jp_2008_files:
update_all(chemicals, filename)
# Then, output a list of chemicals & their classification info for
# each hazard class.
# There will be no separate hazard class field in the output, because
# it will be combined with classification.
header = ['CASRN', 'Name', # 'Hazard class',
'Classification', 'Symbol', 'Signal word', 'Hazard statement',
'Rationale for classification', 'Date of classification']
# This will keep track of unique classifications.
sublists = set()
# I want the output to be in separate CSV files for each hazard class.
# Furthermore, I want separate files for "classification not possible",
# "not classified", and "not applicable".
nafile = open('GHS-jp/output/notapplicable.csv', 'w', newline='')
nawriter = csv.writer(nafile)
nawriter.writerow(header)
ncfile = open('GHS-jp/output/notclassified.csv', 'w', newline='')
ncwriter = csv.writer(ncfile)
ncwriter.writerow(header)
npfile = open('GHS-jp/output/notpossible.csv', 'w', newline='')
npwriter = csv.writer(npfile)
npwriter.writerow(header)
for h in hazard_classes:
with open('GHS-jp/output/' + h + '.csv', 'w', newline='') as outfile:
listwriter = csv.writer(outfile)
listwriter.writerow(header)
for c in sorted(chemicals.keys()):
# Mash the hazard class and category together...
category = str(chemicals[c][h][1]).replace('\n', ' ').strip()
s = str(chemicals[c][h][0]).strip() + ' - ' + category
# These conditions test for exact matches. Things like
# "Category 4 (m-cresol) Not applicable (o- and p-cresol)"
# will not be filtered out, since you want to know about that
# Category 4. Unfortunately, you have to edit these by hand.
# As a side effect, things like "Not applicable (aqueous
# solution)" will be kept even though they're useless.
if category == 'Not applicable':
nawriter.writerow([c] + [chemicals[c]['name']] +
[s] + chemicals[c][h][2:])
elif category == 'Not classified':
ncwriter.writerow([c] + [chemicals[c]['name']] +
[s] + chemicals[c][h][2:])
elif category == 'Classification not possible':
npwriter.writerow([c] + [chemicals[c]['name']] +
[s] + chemicals[c][h][2:])
elif category != '':
# Don't bother outputting rows of empty classifications
# (where no classification results were given).
sublists.add(s)
listwriter.writerow([c] + [chemicals[c]['name']] +
[s] + chemicals[c][h][2:])
nafile.close()
ncfile.close()
npfile.close()
# Output a list of unique classifications (hazard class + category) that
# appear in the hazard-specific output files.
with open('GHS-jp/output/classifications.txt', 'w') as classtxt:
for sub in sorted(sublists):
print(sub, file=classtxt)
# Also output an index of chemicals, just to check for problems.
with open('GHS-jp/output/index.csv', 'w', newline='') as outfile:
listwriter = csv.writer(outfile)
listwriter.writerow(['CASRN', 'Name'])
for c in sorted(chemicals.keys()):
listwriter.writerow([c] + [chemicals[c]['name']])
def crunch_kr():
# Process the Korea GHS classification (2011).
chembook = xlrd.open_workbook('GHS-kr/GHS-kr-2011-04-15.xls')
chemsheet = chembook.sheet_by_index(0)
outfile = open('GHS-kr/output/GHS-kr.csv', 'w', newline='')
listwriter = csv.writer(outfile)
# For practical purposes, I am going to combine the hazard class,
# category, and H-statement fields into one 'Hazard sublist' field.
listwriter.writerow(['CASRN', 'Name', 'Synonyms', 'Hazard sublist',
'M-factor'])
# I also want to enumerate the unique class/category/H-statement
# combinations (sublists).
sublists = set()
# Read in the spreadsheet; process and output results for each line.
for r in range(16,1208):
# Name: (r, 1)
# CASRN: (r, 3)
# Don't overwrite name and CASRN with blanks from merged cells.
if chemsheet.cell_value(r, 1) != '':
name_field = chemsheet.cell_value(r, 1)
# Split lists of synonyms into 2 fields.
names = name_field.split(';', 1)
for i in range(len(names)):
names[i] = names[i].strip()
while len(names) < 2:
names.append('')
if chemsheet.cell_value(r, 3) != '':
casrn_field = chemsheet.cell_value(r, 3)
# Hazard class (r, 4)
# Hazard category (r, 5)
# Pictogram code (r, 6) - (not used anymore?)
# Signal word (r, 7) - (in Korean)
# H-stmnt code (r, 8)
# M-factor (r, 9)
haz_class_field = chemsheet.cell_value(r, 4)
ref = haz_class_field[haz_class_field.find('('):].strip('()')
haz_class_en = ghs_hazard(ref)
if ref == '3.1':
if u'급성 독성-경구' in haz_class_field:
haz_class_en = 'Acute toxicity (oral)'
elif u'급성 독성-경피' in haz_class_field:
haz_class_en = 'Acute toxicity (dermal)'
elif u'급성 독성-흡입' in haz_class_field:
haz_class_en = 'Acute toxicity (inhalation)'
else:
print('Found a different hazard class 3.1 in row ' + str(r))
if ref == '3.4':
if u'피부 과민성' in haz_class_field:
haz_class_en = 'Skin sensitization'
elif u'호흡기 과민성' in haz_class_field:
haz_class_en = 'Respiratory sensitization'
else:
print('Found a different hazard class 3.4 in row ' + str(r))
if ref == '4.1':
if u'수생환경유해성-급성' in haz_class_field:
haz_class_en = 'Hazardous to the aquatic environment (acute)'
elif u'수생환경유해성-만성' in haz_class_field:
haz_class_en = 'Hazardous to the aquatic environment (chronic)'
else:
print('Found a different hazard class 4.1 in row ' + str(r))
# Category values are integers stored as floats.
category = 'Category ' + str(int(chemsheet.cell_value(r, 5)))
h_code = chemsheet.cell_value(r, 8)
h_state = h_code + ' - ' + h_statement(h_code)
# Make the combined hazard class/category/H-statement field:
s = haz_class_en + ' - ' + category + ' [' + h_state + ']'
sublists.add(s)
# Make M-factor field (though not really using it for anything now).
if chemsheet.cell_value(r, 9) != '':
m_factor = str(int(chemsheet.cell_value(r, 9)))
else:
m_factor = ''
# Ensure one CASRN per line when writing output:
for casrn in casrn_field.split(', '):
listwriter.writerow([casrn] + names + [s, m_factor])
outfile.close()
# Output some helpful information about the hazard sublists.
with open('GHS-kr/output/sublists.txt', 'w') as subtxt:
for sub in sorted(sublists):
print(sub, file=subtxt)
def crunch_nz():
# Process the HSNO CCID export.
# Translate HSNO classifications into GHS classifications, and perform
# some additional processing to filter out certain substances.
hsno_ghs = {
# These are GHS translations of the HSNO classes/categories,
# used to create a 'Hazard description' field.
'1.1': ['Explosives', 'Division 1.1'],
'1.2': ['Explosives', 'Division 1.2'],
'1.3': ['Explosives', 'Division 1.3'],
'1.4': ['Explosives', 'Division 1.4'],
'1.5': ['Explosives', 'Division 1.5'],
'1.6': ['Explosives', 'Division 1.6'],
'2.1.1A': ['Flammable gases', 'Category 1'],
'2.1.1B': ['Flammable gases', 'Category 2'],
'2.1.2A': ['Flammable aerosols', 'Category 1'],
'3.1A': ['Flammable liquids', 'Category 1'],
'3.1B': ['Flammable liquids', 'Category 2'],
'3.1C': ['Flammable liquids', 'Category 3'],
'3.1D': ['Flammable liquids', 'Category 4'],
'4.1.1A': ['Flammable solids', 'Category 1'],
'4.1.1B': ['Flammable solids', 'Category 2'],
'4.1.2A': ['Self-reactive substances and mixtures', 'Type A'],
'4.1.2B': ['Self-reactive substances and mixtures', 'Type B'],
'4.1.2C': ['Self-reactive substances and mixtures', 'Type C'],
'4.1.2D': ['Self-reactive substances and mixtures', 'Type D'],
'4.1.2E': ['Self-reactive substances and mixtures', 'Type E'],
'4.1.2F': ['Self-reactive substances and mixtures', 'Type F'],
'4.1.2G': ['Self-reactive substances and mixtures', 'Type G'],
# HSNO doesn't distinguish pyrophoric liquids and solids.
'4.2A': ['Pyrophoric substances', 'Category 1'],
'4.2B': ['Self-heating substances and mixtures', 'Category 1'],
'4.2C': ['Self-heating substances and mixtures', 'Category 2'],
'4.3A': ['Substances and mixtures, which in contact with water, emit flammable gases', 'Category 1'],
'4.3B': ['Substances and mixtures, which in contact with water, emit flammable gases', 'Category 2'],
'4.3C': ['Substances and mixtures, which in contact with water, emit flammable gases', 'Category 3'],
# HSNO doesn't distinguish between oxidizing liquids and solids
# but does distinguish them from oxidizing gases.
'5.1.1A': ['Oxidizing liquids/solids', 'Category 1'],
'5.1.1B': ['Oxidizing liquids/solids', 'Category 2'],
'5.1.1C': ['Oxidizing liquids/solids', 'Category 3'],
'5.1.2A': ['Oxidizing gases', 'Category 1'],
'5.2A': ['Organic peroxides', 'Type A'],
'5.2B': ['Organic peroxides', 'Type B'],
'5.2C': ['Organic peroxides', 'Type C'],
'5.2D': ['Organic peroxides', 'Type D'],
'5.2E': ['Organic peroxides', 'Type E'],
'5.2F': ['Organic peroxides', 'Type F'],
'5.2G': ['Organic peroxides', 'Type G'],
'6.1A (dermal)': ['Acute toxicity: Dermal', 'Category 1'],
'6.1A (inhalation)': ['Acute toxicity: Inhalation', 'Category 1'],
'6.1A (oral)': ['Acute toxicity: Oral', 'Category 1'],
'6.1B (dermal)': ['Acute toxicity: Dermal', 'Category 2'],
'6.1B (inhalation)': ['Acute toxicity: Inhalation', 'Category 2'],
'6.1B (oral)': ['Acute toxicity: Oral', 'Category 2'],
'6.1C (dermal)': ['Acute toxicity: Dermal', 'Category 3'],
'6.1C (inhalation)': ['Acute toxicity: Inhalation', 'Category 3'],
'6.1C (oral)': ['Acute toxicity: Oral', 'Category 3'],
'6.1D (dermal)': ['Acute toxicity: Dermal', 'Category 4'],
'6.1D (inhalation)': ['Acute toxicity: Inhalation', 'Category 4'],
'6.1D (oral)': ['Acute toxicity: Oral', 'Category 4'],
'6.1E (dermal)': ['Acute toxicity: Dermal', 'Category 5'],
'6.1E (inhalation)': ['Acute toxicity: Inhalation', 'Category 5'],
'6.1E (oral)': ['Acute toxicity: Oral', 'Category 5'],
'6.3A': ['Skin corrosion/irritation', 'Category 2'],
'6.3B': ['Skin corrosion/irritation', 'Category 3'],
# 6.4A is both Category 2A and 2B.
'6.4A': ['Serious eye damage/eye irritation', 'Category 2'],
'6.5A (respiratory)': ['Respiratory sensitization', 'Category 1'],
'6.5B (contact)': ['Skin sensitization', 'Category 1'],
# 6.6A is both Category 1A and 1B.
'6.6A': ['Germ cell mutagenicity', 'Category 1'],
'6.6B': ['Germ cell mutagenicity', 'Category 2'],
# 6.7A is both Category 1A and 1B.
'6.7A': ['Carcinogenicity', 'Category 1'],
'6.7B': ['Carcinogenicity', 'Category 2'],
# 6.8A is both Category 1A and 1B.
'6.8A': ['Reproductive toxicity', 'Category 1'],
'6.8B': ['Reproductive toxicity', 'Category 2'],
'6.8C': ['Reproductive toxicity', 'Effects on or via lactation'],
# HSNO doesn't distinguish between single or repeated exposure,
# but does distinguish among exposure routes.
'6.9A (dermal)': ['Specific Target Organ Systemic Toxicity', 'Category 1'],
'6.9A (inhalation)': ['Specific Target Organ Systemic Toxicity', 'Category 1'],
'6.9A (oral)': ['Specific Target Organ Systemic Toxicity', 'Category 1'],
'6.9A (other)': ['Specific Target Organ Systemic Toxicity', 'Category 1'],
'6.9B (dermal)': ['Specific Target Organ Systemic Toxicity', 'Category 2'],
'6.9B (inhalation)': ['Specific Target Organ Systemic Toxicity', 'Category 2'],
'6.9B (oral)': ['Specific Target Organ Systemic Toxicity', 'Category 2'],
'6.9B (other)': ['Specific Target Organ Systemic Toxicity', 'Category 2'],
'8.1A': ['Corrosive to metals', 'Category 1'],
'8.2A': ['Skin corrosion/irritation', 'Category 1A'],
'8.2B': ['Skin corrosion/irritation', 'Category 1B'],
'8.2C': ['Skin corrosion/irritation', 'Category 1C'],
'8.3A': ['Serious eye damage/eye irritation', 'Category 1'],
# In 9.1A, HSNO doesn't distinguish between acute and chronic.
'9.1A (algal)': ['Aquatic toxicity (Acute or Chronic)', 'Category 1'],
'9.1A (crustacean)': ['Aquatic toxicity (Acute or Chronic)', 'Category 1'],
'9.1A (fish)': ['Aquatic toxicity (Acute or Chronic)', 'Category 1'],
'9.1A (other)': ['Aquatic toxicity (Acute or Chronic)', 'Category 1'],
'9.1B (algal)': ['Aquatic toxicity (Chronic)', 'Category 2'],
'9.1B (crustacean)': ['Aquatic toxicity (Chronic)', 'Category 2'],
'9.1B (fish)': ['Aquatic toxicity (Chronic)', 'Category 2'],
'9.1B (other)': ['Aquatic toxicity (Chronic)', 'Category 2'],
'9.1C (algal)': ['Aquatic toxicity (Chronic)', 'Category 3'],
'9.1C (crustacean)': ['Aquatic toxicity (Chronic)', 'Category 3'],
'9.1C (fish)': ['Aquatic toxicity (Chronic)', 'Category 3'],
'9.1C (other)': ['Aquatic toxicity (Chronic)', 'Category 3'],
# The mapping of 9.1D to GHS is very odd.
'9.1D (algal)': ['Aquatic toxicity', 'Category 2-3 (Acute) or Category 4 (Chronic)'],
'9.1D (crustacean)': ['Aquatic toxicity', 'Category 2-3 (Acute) or Category 4 (Chronic)'],
'9.1D (fish)': ['Aquatic toxicity', 'Category 2-3 (Acute) or Category 4 (Chronic)'],
'9.1D (other)': ['Aquatic toxicity', 'Category 2-3 (Acute) or Category 4 (Chronic)'],
# Classes that aren't GHS-translatable:
'3.2A': '', # Liquid desensitized explosives
'3.2B': '', # Liquid desensitized explosives
'3.2C': '', # Liquid desensitized explosives
'4.1.3A': '', # Solid desensitized explosives: high hazard
'4.1.3B': '', # Solid desensitized explosives: medium hazard
'4.1.3C': '', # Solid desensitized explosives: low hazard
'9.2A': '', # Ecotoxic to soil environment
'9.2B': '', # Ecotoxic to soil environment
'9.2C': '', # Ecotoxic to soil environment
'9.2D': '', # Ecotoxic to soil environment
'9.3A': '', # Ecotoxic to terrestrial vertebrates
'9.3B': '', # Ecotoxic to terrestrial vertebrates
'9.3C': '', # Ecotoxic to terrestrial vertebrates
'9.4A': '', # Ecotoxic to terrestrial invertebrates
'9.4B': '', # Ecotoxic to terrestrial invertebrates
'9.4C': '', # Ecotoxic to terrestrial invertebrates
}
ccidbook = xlrd.open_workbook('GHS-nz/CCID Key Studies (4 June 2013).xls')
ccid = ccidbook.sheet_by_index(0)
# Initialize a dictionary of CASRN-identified chemicals. See below...
chemicals = dict()
# Also, enumerate the unique classifications (sublists).
sublists = dict()
# Read in the spreadsheet and generate GHS translations.
for r in range(1, ccid.nrows):
# CASRN (r, 0)
# Substance name (r, 1)
# Approval (r, 2) - ignored
# Classification Text (r, 3)
# Classification Code (r, 4)
# Key Study (r, 5)
casrn = str(ccid.cell_value(r, 0)).strip()
# There is conveniently one substance without a CASRN. If there were
# more, it might pose a problem for the redundancy filtering (below).
if casrn == '':
casrn = 'no_id'
name = ccid.cell_value(r, 1).strip()
# The following needs to be known for every substance:
c = str(ccid.cell_value(r, 4)) # Classification code
k = str(ccid.cell_value(r, 5)) # Key study
# Fix inconsistent spaces around punctuation (for style):
if '(' in c:
c = c[:c.index('(')].strip() + ' ' + c[c.index('('):].strip()
# Keep track of what classifications actually show up in the dataset,
# and store them in a dict where the keys are classification codes.
if c not in sublists:
# Tidy up classification text:
t = ccid.cell_value(r, 3)
if ':' in t:
t = t[:t.index(':')].strip() + ': ' + \
t[t.index(':')+1:].strip()
# I also want to combine classification codes and text, e.g.
# "3.1D - Flammable Liquids: low hazard"
s = c + ' - ' + t
# Find the appropriate GHS translation, if any.
if hsno_ghs[c] != '':
# For my purposes I want it to say 'GHS: ' at the beginning.
g = 'GHS: ' + hsno_ghs[c][0] + ' - ' + hsno_ghs[c][1]
else:
g = ''
sublists[c] = [s, t, g]
# Now put the chemical classifications into a convoluted data
# structure from which we can filter out redundant variants.
# In the dictionary chemicals, each key is a CASRN, and each
# corresponding value is itself a dictionary. The keys of that dict
# are all the different chemical names assigned to that CASRN.
# The values for those keys will be dictionaries (!) where the keys
# are classification codes and the values are key study summaries.
if casrn not in chemicals:
chemicals[casrn] = {name: {c: k}}
elif name not in chemicals[casrn]:
chemicals[casrn][name] = {c: k}
elif c in chemicals[casrn][name]:
chemicals[casrn][name][c] = chemicals[casrn][name][c] + '\n' + k
else:
chemicals[casrn][name][c] = k
# Create output files...
outfile_inc = open('GHS-nz/output/GHS-nz.csv', 'w', newline='')
outfile_var = open('GHS-nz/output/variants.csv', 'w', newline='')
outfile_exc = open('GHS-nz/output/exclude.csv', 'w', newline='')
writer_inc = csv.writer(outfile_inc)
writer_var = csv.writer(outfile_var)
writer_exc = csv.writer(outfile_exc)
header = ['CASRN', 'Substance name', 'HSNO code',
'HSNO classification text', 'GHS translation', 'Key study']
writer_inc.writerow(header)
writer_var.writerow(header)
writer_exc.writerow(header)
# The following section attempts to filter the list so that pure
# substances, solutions, and 'redundant' solutions are output in separate
# files. This is done for practical reasons, to avoid minting hundreds of
# identifiers for differently-dilute solutions of the same chemical.
for casrn in sorted(chemicals.keys()):
# The list of names given to this CASRN:
names = sorted(chemicals[casrn].keys())
# Find the principal (definitely non-redundant) substance from the
# list of names. If they all contain %, then there's no pure substance.
# If there are multiple names which do not contain %, then all but one
# of those will end up in the redundant list. So far, the only such
# case is a substance that seems redundant anyway.
p = -1
for i in range(len(names)):
if '%' not in names[i]:
p = i
break
# If we didn't find any pure substances, assume they are all
# potentially non-redundant; output and continue to next CASRN.
if p == -1:
for j in range(len(names)):
thisclass = chemicals[casrn][names[j]]
for c in sorted(thisclass.keys()):
writer_var.writerow(
['_v' + str(j) + '_' + casrn, names[j], c] +
sublists[c][1:] + [thisclass[c]])
continue
# Having found the principal substance, pop it out of the list of
# names, save its set of classifications, and output them.
pname = names.pop(p)
pclass = chemicals[casrn][pname]
pset = pclass.keys()
for c in sorted(pset):
writer_inc.writerow(
[casrn, pname, c] + sublists[c][1:] + [pclass[c]])
# Next, screen the rest of the named substances against the principal.
# Since these all should be variants of the principal substance, I'll
# add a flag to the CASRN field to help with identifier wrangling.
for n in range(len(names)):
thisclass = chemicals[casrn][names[n]]
thisset = set(thisclass.keys())
if thisset <= pset:
# Redundant: All classifications are included within the
# principal substance's classifications.
for c in sorted(thisset):
writer_exc.writerow(
['_v' + str(n) + '_' + casrn, names[n], c] +
sublists[c][1:] + [thisclass[c]])
else:
# Not redundant, but set aside for further scrutiny.
for c in sorted(thisset):
writer_var.writerow(
['_v' + str(n) + '_' + casrn, names[n], c] +
sublists[c][1:] + [thisclass[c]])
outfile_inc.close()
outfile_var.close()
outfile_exc.close()
# Output some helpful information about the classification sublists.
subs = sorted(sublists.keys())
subfile = open('GHS-nz/output/sublists.csv', 'w', newline='')
subwriter = csv.writer(subfile)
subwriter.writerow(['HSNO code', 'HSNO classification', 'GHS translation'])
for sl in subs:
subwriter.writerow([sl] + [sublists[sl][0], sublists[sl][2]])
subfile.close()
def main():
parser = argparse.ArgumentParser(description='Extract GHS hazard \
classifications from country-specific documents.')
parser.add_argument('countries', action='store', nargs='+',
choices=['jp', 'kr', 'nz'],
help='Process GHS classifications from these countries.')
args = parser.parse_args()
if 'jp' in args.countries:
print('Processing Japan GHS classifications.')
crunch_jp()
if 'kr' in args.countries:
print('Processing Republic of Korea GHS classifications.')
crunch_kr()
if 'nz' in args.countries:
print('Processing Aotearoa New Zealand HSNO classifications.')
crunch_nz()
if __name__ == '__main__':
main()