Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: speed up writing model fields #393

Merged
merged 9 commits into from
Jun 17, 2022
139 changes: 72 additions & 67 deletions code/io/importYaml.m
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,25 @@
error('Yaml file %s cannot be found', string(yamlFilename));
end

if verLessThan('matlab','9.9') %readlines introduced 2020b
fid=fopen(yamlFilename);
line_raw=cell(1000000,1);
while ~feof(fid)
line_raw{i}=fgetl(fid);
i=i+1;
end
line_raw(i:end)=[];
line_raw=string(line_raw);
else
line_raw=readlines(yamlFilename');
end

line_key=regexprep(line_raw,'^ *-? ([^:]+)(:).*','$1');
line_key=regexprep(line_key,'(.*!!omap)|(---)','');

line_value = regexprep(line_raw, '[^":]+: "?(.+)"?$','$1');
line_value = regexprep(line_value, '"','');

% Define the required fields of humanGEM
% There are a total of 37 fields in the model so far, the non-generic ones
% are excluded here
Expand Down Expand Up @@ -65,51 +84,55 @@
rightEqns={};
objRxns={};


% Load Yaml format model

fid = fopen(yamlFilename);
if ~silentMode
fprintf('Start importing...\n');
end

section = 0;
while ~feof(fid)
tline = fgetl(fid);

for i=1:numel(line_key)
tline_raw = line_raw{i};
tline_key = line_key{i};
tline_value = line_value{i};
% import different sections
change_to_section = 0;
switch tline
switch tline_raw
case '- metaData:'
change_to_section = 1;
section = 1;
if ~silentMode
fprintf('\t%d\n', section);
end
continue % Go to next line
case '- metabolites:'
change_to_section = 2;
section = 2;
if ~silentMode
fprintf('\t%d\n', section);
end
continue
case '- reactions:'
change_to_section = 3;
section = 3;
readSubsystems = false;
readEquation = false;
rxnId = '';
if ~silentMode
fprintf('\t%d\n', section);
end
continue
case '- genes:'
change_to_section = 4;
section = 4;
if ~silentMode
fprintf('\t%d\n', section);
end
continue
case '- compartments: !!omap'
change_to_section = 5;
end
if logical(change_to_section)
section = change_to_section;
tline = fgetl(fid);
if ~silentMode
fprintf('\t%d\n', section);
end
section = 5;
if ~silentMode
fprintf('\t%d\n', section);
end
continue
end

% skip over lines containing only omap
if any(regexp(tline, "- !!omap"))
tline = fgetl(fid);
% skip over empty keys
if isempty(tline_key)
continue;
end

% import metaData
if section == 1
[tline_key, tline_value] = tokenizeYamlLine(tline);
switch tline_key
case 'short_name'
model.id = tline_value;
Expand All @@ -134,7 +157,6 @@

% import metabolites:
if section == 2
[tline_key, tline_value] = tokenizeYamlLine(tline);
switch tline_key
case 'id'
model = readFieldValue(model, 'mets', tline_value);
Expand All @@ -155,7 +177,6 @@

% import reactions:
if section == 3
[tline_key, tline_value] = tokenizeYamlLine(tline);
switch tline_key
case 'id'
model = readFieldValue(model, 'rxns', tline_value);
Expand All @@ -164,13 +185,13 @@
model = readFieldValue(model, 'rxnNames', tline_value);

case 'lower_bound'
model.lb = [model.lb; tline_value];
leftEqns = [leftEqns; leftEquation];
rightEqns = [rightEqns; rightEquation];
model.lb(end+1,1) = {tline_value};
leftEqns(end+1,1) = {leftEquation};
rightEqns(end+1,1) = {rightEquation};
readEquation = false;

case 'upper_bound'
model.ub = [model.ub; tline_value];
model.ub(end+1,1) = {tline_value};

case 'gene_reaction_rule'
model = readFieldValue(model, 'grRules', tline_value);
Expand All @@ -182,7 +203,7 @@
model = readFieldValue(model, 'rxnFrom', tline_value);

case 'objective_coefficient'
objRxns = [objRxns; rxnId];
objRxns(end+1,1) = {rxnId};

case 'eccodes'
model = readFieldValue(model, 'eccodes', tline_value);
Expand All @@ -196,33 +217,33 @@

case 'confidence_score'
model = readFieldValue(model, 'rxnConfidenceScores', tline_value);
model.subSystems = [model.subSystems; {subSystems}];
model.subSystems(end+1,1) = {subSystems};
readSubsystems = false;

case 'metabolites'
readEquation = true;
leftEquation = {''};
rightEquation = {''};
leftEquation = '';
rightEquation = '';

otherwise
if readSubsystems
subSystems = [subSystems; regexprep(tline_key, '"', '')];
subSystems(end+1,1) = {regexprep(tline_value, '^ *- (.+)$','$1')};

% resolve the equation
elseif readEquation
metCoeffi = regexp(regexprep(tline, ' +- ', ''), ': ', 'split');
coeffi = str2num(metCoeffi{2});
if coeffi < 0
metCoeffi = regexp(regexprep(tline_raw, ' +- ', ''), ': ', 'split');
coeffi = metCoeffi{2};
if str2double(coeffi) < 0
if strcmp(leftEquation, '')
leftEquation = strcat(num2str(abs(coeffi), 12),32,metCoeffi{1});
leftEquation = [coeffi(2:end),' ',metCoeffi{1}]; %Remove minus sign from coefficient
else
leftEquation = strcat(leftEquation,' +',32,num2str(abs(coeffi), 12),32,metCoeffi{1});
leftEquation = [leftEquation,' + ',coeffi(2:end),' ',metCoeffi{1}];
end
else
if strcmp(rightEquation, '')
rightEquation = strcat(32,num2str(coeffi, 12),32,metCoeffi{1});
rightEquation = [' ',coeffi,' ',metCoeffi{1}];
else
rightEquation = strcat(rightEquation,' +',32,num2str(coeffi, 12),32,metCoeffi{1});
rightEquation = [rightEquation,' + ',coeffi,' ',metCoeffi{1}];
end
end
end
Expand All @@ -231,20 +252,16 @@

% import genes:
if section == 4
[tline_key, tline_value] = tokenizeYamlLine(tline);
model = readFieldValue(model, 'genes', tline_value);
end

% import compartments:
if section == 5
[tline_key, tline_value] = tokenizeYamlLine(tline);
model.comps = [model.comps; tline_key];
model.compNames = [model.compNames; tline_value];
model.comps(end+1,1) = {tline_key};
model.compNames(end+1,1) = {tline_value};
end

end
fclose(fid);


% follow-up data processing
if ~silentMode
Expand Down Expand Up @@ -286,17 +303,5 @@
end

function model = readFieldValue(model, fieldName, value)
model.(fieldName) = [model.(fieldName); {value}];
model.(fieldName)(end+1,1) = {value};
end

function [line_key, line_value]= tokenizeYamlLine(line)
line_key = regexp(line, '^ *-? ([^:]+)', 'tokens');
line_key = char(line_key{1});
line_value = regexp(line, '^ [^:]+: "?(.+)"?$', 'tokens');
if isempty(line_value)
line_value = '';
else
line_value = regexprep(line_value{1}, '"', '');
line_value = char(line_value{1});
end
end
6 changes: 3 additions & 3 deletions model/Human-GEM.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
short_name: "Human-GEM"
full_name: "Generic genome-scale metabolic model of Homo sapiens"
version: ""
date: "2021-12-18"
date: "2022-06-14"
authors: "Jonathan Robinson, Hao Wang, Pierre-Etienne Cholley, Pinar Kocabas"
email: "jonrob@chalmers.se"
organization: "Chalmers University of Technology"
Expand Down Expand Up @@ -292964,7 +292964,7 @@
- rxnNotes: "DOI:10.1007/978-1-4419-0840-7"
- rxnFrom: "Recon3D"
- eccodes: ""
- references: PMID:17655371"
- references: "PMID:17655371"
- subsystem:
- "Drug metabolism"
- confidence_score: 0
Expand Down Expand Up @@ -305769,8 +305769,8 @@
- id: "MAR13086"
- name: ""
- metabolites: !!omap
- MAM02847n: -1
- MAM02847c: 1
- MAM02847n: -1
- lower_bound: 0
- upper_bound: 1000
- gene_reaction_rule: "ENSG00000030066 and ENSG00000047410 and ENSG00000058804 and ENSG00000069248 and ENSG00000075188 and ENSG00000085415 and ENSG00000093000 and ENSG00000094914 and ENSG00000095319 and ENSG00000101146 and ENSG00000102900 and ENSG00000108559 and ENSG00000110713 and ENSG00000111581 and ENSG00000113569 and ENSG00000119392 and ENSG00000120253 and ENSG00000124789 and ENSG00000125450 and ENSG00000126883 and ENSG00000132182 and ENSG00000136243 and ENSG00000138750 and ENSG00000139496 and ENSG00000153201 and ENSG00000153207 and ENSG00000155561 and ENSG00000157020 and ENSG00000157349 and ENSG00000163002 and ENSG00000196313 and ENSG00000213024"
Expand Down