Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto-detect column and row coordinates from niid data #175

Merged
merged 3 commits into from
Jan 6, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 121 additions & 86 deletions tdb/niid_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import subprocess
import unicodedata
from parse import parse
import xlrd
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
import logging
# logger = logging.getLogger()
from titer_block import find_titer_block, find_serum_rows, find_virus_columns

parser.add_argument('--assay_type', default='hi')

Expand All @@ -27,97 +27,132 @@ def read_niid(path, fstem, subtype, assay_type):
if real_file != '':
print("real_file: " + real_file)
ind = '.{}'.format(real_file.split('.')[-1])
convert_xls_to_csv(path, fstem, ind)
fname = "data/tmp/{}.csv".format(fstem)
parse_niid_matrix_to_tsv(fname, path, subtype, assay_type)
convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type)

def convert_xls_to_csv(path, fstem, ind):
import xlrd
wb_name = path + '/' + fstem + ind
workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
for sheet in workbook.sheets():
with open('data/tmp/%s.csv'%(fstem), 'w') as f:
writer = csv.writer(f)
rows = []
for i in range(sheet.nrows):
row = []
for j in range(sheet.ncols):
val = sheet.cell_value(i, j)
row.append(val)
rows.append(row)
writer.writerows(rows)
return

def parse_niid_matrix_to_tsv(fname, original_path, subtype, assay_type):
def convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type):
# Set flutype
suptype=subtype.lower()
flutype = ""
if subtype == "h3n2" or subtype == "h1n1pdm":
flutype = "A"
if subtype == "vic" or subtype == "yam":
flutype = "B"
src_id = fname.split('/')[-1]
with open(fname) as infile:
csv_reader = csv.reader(infile)
mat = list(csv_reader)
with open('data/tmp/%s.tsv'%(src_id[:-4]), 'w') as outfile:
header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
outfile.write("%s\n" % ("\t".join(header)))
original_path = original_path.split('/')
try:
original_path.remove('')
except:
pass
if subtype == "h3n2":
serum_id_row_index = 5 #5
start_row = 7
virus_id_col_index = 1
start_col = 4
elif subtype == "h1n1pdm":
serum_id_row_index = 5
start_row = 6
virus_id_col_index = 1
start_col = 4
elif subtype == "vic":
serum_id_row_index = 4
start_row = 5
virus_id_col_index = 1
start_col = 4
elif subtype == "yam":
serum_id_row_index = 3
start_row = 5
virus_id_col_index = 1
start_col = 4
for i in range(start_row, len(mat)):
for j in range(start_col, len(mat[0])):
virus_strain = mat[i][virus_id_col_index]
serum_id = mat[serum_id_row_index][j]
serum_id = re.sub(r'[\r\n ]+', '', serum_id)
m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
if m is None:
m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
serum_strain = ""
# import pdb; pdb.set_trace()
if m:
serum_strain = m.group(1)
if not serum_strain.startswith(flutype + "/"):
serum_strain = flutype + "/" + serum_strain
# Normalize U+ff1c '<' to U+003c '<'
titer = unicodedata.normalize('NFKC', mat[i][j])
# Allow either "< 10" or "<10"
titer = re.sub(r'< ', '<', titer)
source = "niid_%s"%(src_id)
virus_passage = mat[i][2]
virus_passage_category = ''
serum_passage = "unknown"
m = re.search(r'(egg)', serum_id, re.IGNORECASE)
if m:
serum_passage = m.group(1)
m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
if m:
serum_passage = m.group(1)
serum_passage_category = ''
line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
outfile.write(line)

# Set NIID patterns
virus_pattern = r"[A-Z]/[\w\s-]+/.+/\d{4}"
virus_passage_pattern = r"(MDCK|SIAT|E\d+|hCK)"
serum_id_pattern = r".+(No\.|no\.).+"
serum_passage_pattern = r".+(Egg|Cell).+"
serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"
crick = False
Comment on lines +45 to +47
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non-blocking

Since the returned serum_passage_row_idx, serum_abbrev_row_idx, and serum_mapping are not being used, do we even need to set these patterns?

Copy link
Contributor Author

@j23414 j23414 Jan 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! The serum_abbrev_row_idx pattern is used to get the row coordinate. I miswrote that it was not being used:

[Edit after I read the comment more carefully
re: serum_abbrev_row_id VIDRL data had separate serum_id and serum_abbr rows while NIID data combines them into one row. You're correct I could have set this to None and rely on default pattern:

fauna/tdb/titer_block.py

Lines 230 to 231 in 47c06e8

if serum_abbrev_pattern is None:
serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"

]

Regarding the others:

  • serum_passage_row_idx: Could be optional, I left it in to potentially validate that the serum passage matches the same row as serum_abbrev_row_idx and isn't missing.
  • serum_mapping: Could be optional, or eventually refined to become the official NIID serum mapping.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I should have been clearer in my comment. I was confused by the serum_passage_pattern and serum_abbrev_pattern because that the actual data parsing was using completely separate patterns (serum strain patterns and passage patterns).


# Open workbook
wb_name = path + '/' + fstem + ind
workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1):
print(f"Reading worksheet {worksheet_index} '{worksheet.name}' in file '{fstem}'")
# autodetecting titer, virus, serum blocks
titer_block = find_titer_block(worksheet)

if len(titer_block["col_start"]) == 0:
print("No titer block found.")
break

titer_coords = {
'col_start': titer_block["col_start"][0][0],
'col_end': titer_block["col_end"][0][0],
'row_start': titer_block["row_start"][0][0],
'row_end': titer_block["row_end"][0][0]
}

virus_block = find_virus_columns(
worksheet=worksheet,
titer_coords=titer_coords,
virus_pattern=virus_pattern,
virus_passage_pattern=virus_passage_pattern,
)

# If no virus names are found, might not be a valid worksheet, skip worksheet to avoid breaking find_serum_rows
if virus_block["virus_names"] is None:
print(f"Virus names not found. Check the virus pattern: '{virus_pattern}'")
break

serum_block = find_serum_rows(
worksheet=worksheet,
titer_coords=titer_coords,
virus_names=virus_block["virus_names"],
serum_id_pattern=serum_id_pattern,
serum_passage_pattern=serum_passage_pattern,
serum_abbrev_pattern=serum_abbrev_pattern,
crick=crick,
)

# Print the most likely row and column indices for the titer block
print(f"Titer block: n = {titer_block['row_start'][0][1]}x{titer_block['col_start'][0][1]} = {titer_block['row_start'][0][1]*titer_block['col_start'][0][1]}")
print(f" Most likely (n={titer_block['col_start'][0][1]}) col_start: {titer_block['col_start'][0][0]}")
print(f" Most likely (n={titer_block['col_end'][0][1]}) col_end: {titer_block['col_end'][0][0]}")
print(f" Most likely (n={titer_block['row_start'][0][1]}) row_start: {titer_block['row_start'][0][0]}")
print(f" Most likely (n={titer_block['row_end'][0][1]}) row_end: {titer_block['row_end'][0][0]}")

# For debugging purposes, print alternative indices (e.g. col_start, col_end, row_start, row_end)
# print("Alternative indices:")
# for i in range(1, len(titer_block['row_start'])):
# print(f" Alternative (n={titer_block['row_start'][i][1]}) row_start: {titer_block['row_start'][i][0]}")

# Print Virus and Serum annotations row and column indices
print("Virus (antigen) block: left and right of the titer block")
print(f" virus column index: {virus_block['virus_col_idx']}")
print(f" virus passage column index: {virus_block['virus_passage_col_idx']}")
print(f" virus names: {virus_block['virus_names']}")

print("Serum (antisera) block: above the titer block")
print(f" serum ID row index: {serum_block['serum_id_row_idx']}")
print("Serum strain and serum passage will be parsed from serum ID row")

mat = worksheet

with open('data/tmp/%s.tsv'%(fstem), 'w') as outfile:
header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
outfile.write("%s\n" % ("\t".join(header)))

serum_id_row_index = serum_block['serum_id_row_idx']
row_start = titer_coords['row_start']
row_end = titer_coords['row_end']
virus_id_col_index = virus_block['virus_col_idx']
virus_passage_col_index=virus_block['virus_passage_col_idx']
col_start = titer_coords['col_start']
col_end = titer_coords['col_end']

for i in range(row_start, row_end+1):
for j in range(col_start, col_end+1):
virus_strain = str(mat.cell_value(i,virus_id_col_index)).strip()
serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
serum_id = re.sub(r'[\r\n ]+', '', serum_id)
m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
if m is None:
m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
serum_strain = ""
if m:
serum_strain = m.group(1)
if not serum_strain.startswith(flutype + "/"):
serum_strain = flutype + "/" + serum_strain
Comment on lines +131 to +138
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non-blocking

Seems like this should be replaced by the serum_map, am I missing the reason why the serum map cannot be used?

Suggested change
m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
if m is None:
m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
serum_strain = ""
if m:
serum_strain = m.group(1)
if not serum_strain.startswith(flutype + "/"):
serum_strain = flutype + "/" + serum_strain
serum_strain = serum_mapping[serum_id]

Copy link
Contributor Author

@j23414 j23414 Jan 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly I just wasn't confident my mapping was matching the behavior of the original mapping. For example:

envdir ~/nextstrain/env.d/seasonal-flu/ \
  python tdb/niid_upload.py \
  -db niid_tdb \
  --virus flu \
  --subtype h1n1pdm \
  --assay_type hi \
  --path /Users/jchang3/nextstrain/fludata/NIID-Tokyo-WHO-CC/raw-data/A/H1N1pdm/HI/2024/ \
  --fstem H1pdm09_HIdata\(20241003\) \
  --ftype niid \
  --preview &>> my_log/niid_H1pdm09_HIdata\(20241003\).txt

titer_parse.py gave serum_mapping values like:

serum_mapping = {
    'Wisconsin/67/22CellNo.111': 'A/Wisconsin/67/2022',
    'Victoria/4897/22EggNo.121': 'A/Victoria/4897/2022',
    'Victoria/4897/22(IVR-238)No.112': 'A/Victoria/4897/2022 IVR-238',
    'Norway/25089/22CellNo.152': 'A/Norway/25089/2022',
    'Sydney/5/21CellNo.131': 'A/Sydney/5/2021',
    'India/PUN-NIV323546/21CellNo.112': 'A/India/PUN-NIV323546/2021',
    'KANAGAWA/IC1920/20CellNo.111': 'A/KANAGAWA/IC1920/2020',
    'OKINAWA/93/19CellNo.171': 'A/OKINAWA/93/2019',
}

While checking the original parsing of serum_names (2nd column) in visidata, gave serum names like:

visidata data/tmp/H1pdm09_HIdata\(20241003\).tsv

#> Example, only showing 2nd column of serum_names
#> A/Wisconsin/67/22
#> A/Victoria/4897/22
#> A/Victoria/4897/22(
#> A/Norway/25089/22
#> A/Sydney/5/21
#> A/India/PUN-NIV323546/21
#> A/KANAGAWA/IC1920/20
#> A/OKINAWA/93/19

Therefore I relied on the original behavior code, till I had time to dig further into this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha, agree this can wait till later!

From first glance, the serum mapping includes the complete strain name so it could be switched over. However, I just noticed the A/Victoria/4897/2022 IVR-238 strain, which needs to drop IVR-238 🤔

# Normalize U+ff1c '<' to U+003c '<'
titer = unicodedata.normalize('NFKC', str(mat.cell_value(i,j)).strip())
# Allow either "< 10" or "<10"
titer = re.sub(r'< ', '<', titer)
source = "niid_%s"%(fstem).strip()
virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip()
virus_passage_category = ''
serum_passage = "unknown"
m = re.search(r'(egg)', serum_id, re.IGNORECASE)
if m:
serum_passage = m.group(1)
m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
if m:
serum_passage = m.group(1)
serum_passage_category = ''
line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
outfile.write(line)

def determine_subtype(original_path):
original_path = original_path.lower().split('/')
Expand Down