Several small fixes

- bugs resulting from char/string incompatibilities - bug concerning Neuromorphometrics atlas - bug concerning "phenotype" field when using custom data - missing n_rois field when using custom parcellation
LeonDLotter · Apr 21, 2022 · bc345d0 · bc345d0
1 parent ab2343c
commit bc345d0
Show file tree

Hide file tree

Showing 13 changed files with 100 additions and 46 deletions.
diff --git a/ABAnnotate.m b/ABAnnotate.m
@@ -21,11 +21,11 @@
            'or data (opt.phenotype_data)!']);
 end
 
-% get directory & start logging
+% get directories & start logging
 wd = fileparts(which('ABAnnotate'));
-tempdir = fullfile(wd, 'temp')
+tempdir = fullfile(wd, 'temp');
 if ~exist(tempdir, 'dir')
-    mkdir(tempdir)
+    mkdir(tempdir);
 end
 logfile = fullfile(tempdir, 'logfile_temp.txt');
 diary(logfile);
@@ -41,19 +41,23 @@
 if ~isfield(opt, 'analysis_name') && isfield(opt, 'phenotype')
     [~, opt.analysis_name, ~] = fileparts(opt.phenotype);
 elseif ~isfield(opt, 'analysis_name')
-    opt.analysis_name = ['enrich_' date];
+    opt.analysis_name = sprintf('enrich_%s', date);
+else
+    opt.analysis_name = char(opt.analysis_name);
 end
-print_section(['Starting ABAnnotate: ' opt.analysis_name]);
+print_section(sprintf('Starting ABAnnotate: %s', opt.analysis_name));
 
 
 %% ------------------------------------------------------------------------
 % Prepare imaging files & null maps
 % -------------------------------------------------------------------------
 
-% set atlas & aba data ---------------------------------------------------
+% set atlas & aba data ----------------------------------------------------
+
 % default: SchaeferTian-116
 if ~isfield(opt, 'atlas'); opt.atlas = 'Schaefer100-7_TianS1'; end
 % choose atlas - currently only SchaeferTian116, Schaefer100, and Neuromorphometrics defined
+opt.atlas = char(opt.atlas);
 switch opt.atlas
     case {'SchaeferTian', 'Schaefer100-7_TianS1'}
         atlas = 'Schaefer100-7_TianS1';
@@ -67,8 +71,10 @@
     otherwise
         atlas_type = 'user';
 end
+
 % get atlas
 switch atlas_type 
+
     case 'integrated'
         atlas_dir = fullfile(wd, 'atlas', atlas);
         if ~exist(atlas_dir, 'dir')
@@ -82,15 +88,17 @@
             delete(atlas_zip);
         end
         % set atlas
-        opt.atlas = fullfile(wd, 'atlas', atlas, [atlas '_atlas.nii']);
-        opt.aba_mat = fullfile(wd, 'atlas', atlas, [atlas '_expression.mat']);
+        opt.atlas = fullfile(wd, 'atlas', atlas, sprintf('%s_atlas.nii', atlas));
+        opt.aba_mat = fullfile(wd, 'atlas', atlas, sprintf('%s_expression.mat', atlas));
         load(opt.aba_mat, 'expression_matrix')
         opt.n_rois = height(expression_matrix);
         clear('expression_matrix');
-        fprintf('Setting atlas & ABA data to %s - %u parcels.\n', opt.atlas, opt.n_rois)
+        fprintf('Setting atlas & ABA data to %s - %u parcels.\n', opt.atlas, opt.n_rois);
+
     case 'user'
         % check if custom volume. if yes, unzip if ending *.nii.gz
         try
+            % load & unzip atlas
             [~, ~, atlas_ext] = fileparts(opt.atlas);
             if strcmp(atlas_ext, '.gz')
                 atlas_unzip = gunzip(opt.atlas, fullfile(wd, 'temp'));
@@ -99,18 +107,35 @@
             else
                 error('Filetype of %s is not supported!', opt.phenotype);
             end
-            disp(['Using custom provided atlas: ' opt.atlas]);
-            if ~isfield(opt, 'aba_mat')
-                error('If you provide a custom atlas, you must also provide matching ABA data!')
-            end
         catch
             error(['Atlas not valid. Choose one of the integrated atlases or ', ...
                    'provide a path to a custom atlas volume! ', ...
                    'Run <abannotate_get_datasets("parcellation");> for a list of available atlases.']);
         end
+        % get roi number
+        opt.n_rois = get_number_of_rois(opt.atlas);
+        fprintf('Using custom provided atlas: %s - %u parcels.\n', opt.atlas, opt.n_rois);
+        % check associated ABA expression matrix
+        if ~isfield(opt, 'aba_mat')
+            error('If you provide a custom atlas, you must also provide matching ABA data!')
+        end
+        try
+            load(opt.aba_mat, 'expression_matrix', 'gene_symbols');
+        catch
+            error('Problems loading variables "expression_matrix" and "gene_symbols" from opt.aba_mat!');
+        end
+        % compare numbers
+        if opt.n_rois~=height(expression_matrix)
+            error('Number of ROIs in atlas (%u) does not match number of values in expression matrix (%u)!', opt.n_rois, height(expression_matrix))
+        end
+        if length(gene_symbols)~=width(expression_matrix)
+            error('Length of gene_symbols width of expression_matrix do not match!');
+        end
+        clear('expression_matrix', 'gene_symbols');
 end
 
 % parcellate pheno volume -------------------------------------------------
+
 % if pheno data provided, will use data and ignore volume!
 if ~isfield(opt, 'phenotype_data')
     fprintf('Applying parcellation to %s.\n', opt.phenotype);
@@ -119,25 +144,34 @@
     if strcmp(phenotype_ext, '.gz')
         phenotype_unzip = gunzip(opt.phenotype, fullfile(wd, 'temp'));
     elseif strcmp(phenotype_ext, '.nii')
-        phenotype_unzip = {opt.phenotype};
+        phenotype_unzip = cellstr(opt.phenotype);
     else
         error('Filetype of %s is not supported!', opt.phenotype);
     end
     % parcellate
     [phenotype_parc, ~] = mean_time_course(phenotype_unzip, opt.atlas, 1:opt.n_rois);
     opt.phenotype_data = phenotype_parc';
+
 % else check if input data is provided
 elseif isfield(opt, 'phenotype_data')
-    if size(opt.phenotype_data) ~= [opt.n_rois 1]
-        error('Phenotype data has to be opt.n_rois x 1 vector!');
+    disp('Phenotype data array provided, overriding opt.phenotype with "data".');
+    opt.phenotype = 'data';
+    if ~isequal(size(opt.phenotype_data), [opt.n_rois 1])
+        error('Phenotype data has to be vector of size [%d, 1]!', opt.n_rois);
     end
-    disp('Phenotype data array provided, ignoring opt.phenotype');
+
 % else something did not work
 else
     error('It seems you did not provide opt.phenotype or opt.phenotype_data!'); 
-end
+    end
 
+% check for NaNs
+if any(isnan(opt.phenotype_data))
+    warning('Parcellated phenotype data contains NaNs! Respective regions will be ignored.');
+end
+
 % get phenotype nulls -----------------------------------------------------
+
 % number of phenotype nulls
 if ~isfield(opt, 'n_nulls'); opt.n_nulls = 1000; end
 
@@ -204,22 +238,22 @@
 
 % Dataset -----------------------------------------------------------------
 % default to Gene Ontology - Biological Process
-% case 1: one of integrated datasets
-if isfield(opt.GCEA, 'dataset') && ~strcmp(opt.GCEA.dataset, 'custom')
+% case 1: none set, default to GO-BP
+if ~isfield(opt.GCEA, 'dataset') && ~isfield(opt.GCEA, 'dataset_mat')
+    opt.GCEA.dataset = 'GO-biologicalProcessDirect-discrete';
+    fprintf('No dataset provided, defaulting to: %s\n', opt.GCEA.dataset);
+    % get path or download
+    opt.GCEA.dataset_mat = get_gcea_dataset(opt.GCEA.dataset);
+% case 2: one of integrated datasets
+elseif isfield(opt.GCEA, 'dataset') && ~strcmp(opt.GCEA.dataset, 'custom')
     % get path or download
     opt.GCEA.dataset_mat = get_gcea_dataset(opt.GCEA.dataset);
-    disp(['GCEA dataset: ' opt.GCEA.dataset]);
+    fprintf('GCEA dataset: %s\n', opt.GCEA.dataset);
     validate_gcea_dataset(opt.GCEA.dataset_mat);
-% case 2: a custom mat file
+% case 3: a custom mat file
 elseif strcmp(opt.GCEA.dataset, 'custom') && isfield(opt.GCEA, 'dataset_mat')
-    disp(['Custom GCEA dataset: ' opt.GCEA.dataset_mat]);
+    fprintf('Custom GCEA dataset: %s\n', opt.GCEA.dataset_mat);
     validate_gcea_dataset(opt.GCEA.dataset_mat);
-% case 3: none set, default to GO-BP
-elseif ~isfield(opt.GCEA, 'dataset') && ~isfield(opt.GCEA, 'dataset_mat')
-    opt.GCEA.dataset = 'GO-biologicalProcessDirect-discrete';
-    % get path or download
-    opt.GCEA.dataset_mat = get_gcea_dataset(opt.GCEA.dataset);
-    disp(['No dataset provided, defaulting to: ' opt.GCEA.dataset]);
 % case 4: other
 else 
     disp(['You must provide ether a valid dataset name or set <opt.gcea.dataset> to ', ...
@@ -277,26 +311,26 @@
     cTablePheno_csv = cTablePheno;
     cTablePheno_csv = removevars(cTablePheno_csv, {'cWeights', 'cScoresNull'});
     cTablePheno_csv.cGenes = cellfun(@(x) strjoin(x, ', '), cTablePheno.cGenes, 'UniformOutput', false);
-    writetable(cTablePheno_csv, fullfile(opt.dir_result, ['GCEA_' opt.analysis_name '.csv']));
+    writetable(cTablePheno_csv, fullfile(opt.dir_result, sprintf('GCEA_%s.csv', opt.analysis_name)));
 
     % write mat
-    save(fullfile(opt.dir_result, ['GCEA_' opt.analysis_name '.mat']), 'cTablePheno', 'opt');
+    save(fullfile(opt.dir_result, sprintf('GCEA_%s.mat', opt.analysis_name)), 'cTablePheno', 'opt');
 
     % write setting xml
-    writestruct(opt, fullfile(opt.dir_result, ['GCEA_' opt.analysis_name '_settings.xml']))
+    writestruct(opt, fullfile(opt.dir_result, sprintf('GCEA_%s_settings.xml', opt.analysis_name)))
 
     % copy logfile
-    copyfile(logfile, fullfile(opt.dir_result, ['GCEA_' opt.analysis_name '_log.txt']));
+    copyfile(logfile, fullfile(opt.dir_result, sprintf('GCEA_%s_log.txt', opt.analysis_name)));
 
     % print
-    fprintf('Saved results to %s.\n', fullfile(opt.dir_result, ['GCEA_' opt.analysis_name '*.*']));
+    fprintf('Saved results to %s.\n', fullfile(opt.dir_result, sprintf('GCEA_%s*.*', opt.analysis_name)));
 end
 
 % delete temp data
 diary('off');
 delete(fullfile(tempdir, '*'));
 
-print_section(['Finished ABAnnotate: ' opt.analysis_name]);
+print_section(sprintf('Finished ABAnnotate: %s', opt.analysis_name));
 
 end % function
 

diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@
 
 The method basically consists of the following steps:
 
-1. An input volume ("phenotype") is parcellated according to given parcellation and null models corrected for spatial-autocorrelation are generated.  
+1. An input volume ("phenotype") is parcellated according to given parcellation and null models corrected for spatial autocorrelation are generated (if autocorrelation was detected in the data).  
 2. For each null phenotype and each gene category a "category score" is obtained by correlating the null phenotype with the spatial mRNA expression pattern of each gene and averaging the z-transformed correlation coefficients off all genes annotated to a certain category within each category (= null categories).  
 3. The generated null category scores are then compared to the "real" category score obtained by correlating the "real" phenotype with all genes in each category and averaging the correlation coefficients per category.  
 4. One-sided p values for each category are obtained from the estimated null distribution of category scores and the resulting p-values are FDR-corrected.  

diff --git a/example/example_customization.mlx b/example/example_customization.mlx
diff --git a/scripts/abannotate_delete_datasets.m b/scripts/abannotate_delete_datasets.m
@@ -42,7 +42,7 @@
 if ismember(del, {'Y', 'y', 'yes', 'Yes', 'YES'})
     for i=1:length(files_to_delete)
         file_to_delete = files_to_delete{i};
-        disp(['Deleting file: ' file_to_delete]);
+        fprintf('Deleting file: %s\n', file_to_delete);
         if endsWith(file_to_delete, '.mat')
             delete(file_to_delete); 
         else

diff --git a/scripts/abannotate_delete_null_data.m b/scripts/abannotate_delete_null_data.m
@@ -40,7 +40,7 @@
 if ismember(del, {'Y', 'y', 'yes', 'Yes', 'YES'})
     for i=1:length(files_to_delete)
         fullfile_to_delete = fullfile(null_dir, files_to_delete{i});
-        disp(['Deleting file: ' fullfile_to_delete]);
+        fprintf('Deleting file: %s\n', fullfile_to_delete);
         delete(fullfile_to_delete); 
     end
 else

diff --git a/scripts/abannotate_get_datasets.m b/scripts/abannotate_get_datasets.m
@@ -7,6 +7,8 @@
 
 if ~exist('dataset_type', 'var')
     dataset_type = 'gcea_dataset';
+else
+    dataset_type = char(dataset_type);
 end
 if ~exist('disp_names', 'var')
     disp_names = true;

diff --git a/scripts/aggregate_scores.m b/scripts/aggregate_scores.m
@@ -25,7 +25,7 @@
     case 'absweightedmean'
         category_score = nansum(abs(gene_scores) .* gene_weights, 1) ./ nansum(gene_weights, 1);
     otherwise
-        error('Unknown aggregation option: ''%s''', aggregation_method);
+        error('Unknown aggregation option: "%s"', aggregation_method);
 end
 
 end
diff --git a/scripts/ensemble_enrichment.m b/scripts/ensemble_enrichment.m
@@ -78,7 +78,7 @@
 % Give a basic output about significance 
 fprintf(1,'%u significant categories at p < %.3f derived from gaussian distributions.\n', ...
     sum(cTablePheno.pValZCorr < gcea_opt.p_thresh), gcea_opt.p_thresh);
-fprintf(1,'%u significant categories at p < %.3f derived from null distributions. \n', ...
+fprintf(1,'%u significant categories at p < %.3f derived from null distributions.\n', ...
     sum(cTablePheno.pValPermCorr < gcea_opt.p_thresh), gcea_opt.p_thresh);
 
 end
diff --git a/scripts/generate_category_nulls.m b/scripts/generate_category_nulls.m
@@ -109,7 +109,7 @@
 fprintf('\n');
 
 %--------------------------------------------------------------------------
-% Agglomerate gene scores by GO category
+% Agglomerate gene scores by gene category
 %--------------------------------------------------------------------------
 category_scores = cell(n_categories, 1);
 for i = 1:n_categories
@@ -151,7 +151,7 @@
 % Save results to .mat file
 save_path = gcea_opt.category_nulls;
 if save_result
-    fprintf(1,'Saving nulls from %u iterations to ''%s''\n', n_null, save_path);
+    fprintf(1,'Saving nulls from %u iterations to "%s"\n', n_null, save_path);
     save(save_path, 'cTable', 'gcea_opt', '-v7');
 end
 

diff --git a/scripts/get_gcea_dataset.m b/scripts/get_gcea_dataset.m
@@ -4,6 +4,7 @@
 % Gets path to ABAnnotate GCEA dataset or downloads it from OSF
 
 wd = fileparts(which('Abannotate'));
+dataset_name = char(dataset_name);
 
 % get list of datasets
 dataset_list = abannotate_get_datasets('gcea_dataset', false);
@@ -20,7 +21,7 @@
 if ~exist(dataset_path, 'file')
     disp('Dataset not found. Downloading from OSF...')
     abannotate_download_osf(osf_id, dataset_path);
-    disp(['Saved to: ' dataset_path]);
+    fprintf('Saved to: %s\n', dataset_path);
 end
 
 end

diff --git a/scripts/get_number_of_rois.m b/scripts/get_number_of_rois.m
@@ -0,0 +1,15 @@
+function [n_rois] = get_number_of_rois(vol)
+%function [n_rois] = get_number_of_rois(vol)
+
+vol_hdr = spm_vol(vol);
+vol_data = spm_read_vols(vol_hdr);
+roi_idc = unique(vol_data);
+roi_idc(roi_idc==0) = [];
+
+if ~isequal(roi_idc, round(roi_idc))
+    warning('Parcellation volume seems to contain non-integers!')
+end
+
+n_rois = length(roi_idc);
+
+end
diff --git a/scripts/mean_time_course.m b/scripts/mean_time_course.m
@@ -1,6 +1,8 @@
 function [D, Reg_all] = mean_time_course(data_mean, data_ROI, numberROIs)
-%[D,Reg_all] = mean_time_course(data_for_mean, mask_ROIs,numberROIs)
-
+%[D, Reg_all] = mean_time_course(data_mean, data_ROI, numberROIs)
+%
+% Extract region-wise data from a volume {data_mean} and a parcellation
+% {data_ROI}. {numberROIs} must be a vector of length 1:number of rois.
 
 size_data = size(data_mean);
 [ROI_matrix] = resize_img_useTemp_imcalc(data_ROI,data_mean{1});

diff --git a/scripts/resize_img_useTemp_imcalc.m b/scripts/resize_img_useTemp_imcalc.m
@@ -1,5 +1,5 @@
 function [img3d] = resize_img_useTemp_imcalc(file,temp)
-% function [img3d] = resize_img_useTemp_imcalc(file,temp,opt_save)
+% function [img3d] = resize_img_useTemp_imcalc(file,temp)
 %-----------------------------------------------------------------------
 % Job saved on 15-Jul-2015 09:33:17 by cfg_util (rev $Rev: 6134 $)
 % spm SPM - SPM12 (6225)