diff --git a/diag.bib b/diag.bib index bb08283..c3623b3 100644 --- a/diag.bib +++ b/diag.bib @@ -1367,19 +1367,19 @@ @article{Alex21 doi = {10.1093/cid/ciaa1855}, year = {2021}, abstract = {Abstract - + Background Clinical imaging in suspected invasive fungal disease (IFD) has a significant role in early detection of disease and helps direct further testing and treatment. Revised definitions of IFD from the EORTC/MSGERC were recently published and provide clarity on the role of imaging for the definition of IFD. Here, we provide evidence to support these revised diagnostic guidelines. - - + + Methods We reviewed data on imaging modalities and techniques used to characterize IFDs. - - + + Results Volumetric high-resolution computed tomography (CT) is the method of choice for lung imaging. Although no CT radiologic pattern is pathognomonic of IFD, the halo sign, in the appropriate clinical setting, is highly suggestive of invasive pulmonary aspergillosis (IPA) and associated with specific stages of the disease. The ACS is not specific for IFD and occurs in the later stages of infection. By contrast, the reversed halo sign and the hypodense sign are typical of pulmonary mucormycosis but occur less frequently. In noncancer populations, both invasive pulmonary aspergillosis and mucormycosis are associated with "atypical" nonnodular presentations, including consolidation and ground-glass opacities. - - + + Conclusions A uniform definition of IFD could improve the quality of clinical studies and aid in differentiating IFD from other pathology in clinical practice. Radiologic assessment of the lung is an important component of the diagnostic work-up and management of IFD. Periodic review of imaging studies that characterize findings in patients with IFD will inform future diagnostic guidelines. 
}, @@ -1952,11 +1952,11 @@ @conference{Aswo19 booktitle = {European Congress of Pathology}, title = {Potential of an AI-based digital biomarker to predict neoadjuvant chemotherapy response from preoperative biopsies of Luminal-B breast cancer}, abstract = {Background & objectives: Invasive breast cancer (IBC) is increasingly treated with neoadjuvant chemotherapy. Yet, only 15-20% of Luminal-B patients achieve pathological complete response (pCR). We developed an AI-based biomarker to predict pCR of Luminal-B IBC from preoperative biopsies stained with H&E. - + Methods: First, we trained a deep learning model on a multi-centric dataset of n=277 manually annotated breast cancer H&E-stained histopathology images to segment tumour, lymphocytes and other tissue. Second, we applied the segmentation model to an independent set of n=297 Luminal-B pre-treatment biopsies. For each case, we computed our biomarker: the proportion of tumour within 80mm distance from lymphocyte regions. - + Results: From the Luminal-B cohort, 32/297 cases (11%) were labelled as "pCR" when no remaining cancer cells were reported for the post-operative surgical resection. The biomarker showed significant (p<<0.01) correlation with pCR with a point biserial correlation coefficient of 0.27. Setting a cut-off value based on the optimal operating point of the ROC curve (AUC=0.69), we reached a sensitivity of 0.53 and a specificity of 0.74. - + Conclusion: The developed deep-learning based biomarker quantifies the proportion of inflammatory tumour regions. It shows promising results for predicting pCR for Luminal-B breast cancer from pre-treatment biopsies stained with H&E.}, optnote = {DIAG, RADIOLOGY}, year = {2019}, @@ -1985,13 +1985,13 @@ @article{Aswo23 abstract = {Abstract Background Invasive breast cancer patients are increasingly being treated with neoadjuvant chemotherapy; however, only a fraction of the patients respond to it completely. 
To prevent overtreatment, there is an urgent need for biomarkers to predict treatment response before administering the therapy. - + Methods In this retrospective study, we developed hypothesis-driven interpretable biomarkers based on deep learning, to predict the pathological complete response (pCR, i.e., the absence of tumor cells in the surgical resection specimens) to neoadjuvant chemotherapy solely using digital pathology H&E images of pre-treatment breast biopsies. Our approach consists of two steps: First, we use deep learning to characterize aspects of the tumor micro-environment by detecting mitoses and segmenting tissue into several morphology compartments including tumor, lymphocytes and stroma. Second, we derive computational biomarkers from the segmentation and detection output to encode slide-level relationships of components of the tumor microenvironment, such as tumor and mitoses, stroma, and tumor infiltrating lymphocytes(TILs). - + Results We developed and evaluated our method on slides from n=721 patients from three European medical centers with triple-negative and Luminal B breast cancers and performed external independent validation on n=126 patients from a public dataset. We report the predictive value of the investigated biomarkers for predicting pCR with areas under the receiver operating characteristic curve between 0.66 and 0.88 across the tested cohorts. - + Conclusion The proposed computational biomarkers predict pCR, but will require more evaluation and finetuning for clinical application. Our results further corroborate the potential role of deep learning to automate TILs quantification, and their predictive value in breast cancer neoadjuvant treatment planning, along with automated mitoses quantification. 
We made our method publicly available to extract segmentation-based biomarkers for research purposes.}, automatic = {yes}, @@ -2279,7 +2279,7 @@ @phdthesis{Balk20a title = {Tissue-based biomarker assessment for predicting prognosis of triple negative breast cancer: the additional value of artificial intelligence}, url = {https://repository.ubn.ru.nl/handle/2066/220344}, abstract = {Despite much research, currently still about 1 in 4 patients with TNBC will develop a recurrence after which the survival outlook is very poor. To date, no prognostic markers are available for TNBC to adequately stratify patients for the risk of developing a recurrence. The emergence of powerful computer algorithms, in particular deep learning models, enable more in depth and more extensive biomarker exploration. In addition, these algorithms are objective and reproducible, in contrast to most human visual biomarker assessment. The first aim of this thesis was to establish a well-defined cohort of TNBC, consisting of tissue sections, clinical and pathology data as well as comprehensive follow up data. Secondly, we aimed to evaluate the prognostic value of the mitotic count, which has widespread clinical use as part of the Nottingham grading system. We studied mitotic count both via conventional manual assessment and automatic assessment, to see if we could find a cut-off value which is better tailored for TNBC. Our third aim was to evaluate the prognostic value of TILs, a promising biomarker not yet used in clinical practice. - + To study the prognostic value of biomarkers in TNBC, the following objectives were defined: 1. Establish a multicentre TNBC cohort including tissue sections and follow up data (Chapter 2) 2. 
Develop a baseline prognostic model for TNBC based on the currently known clinicopathological variables (Chapter 2) @@ -2570,7 +2570,7 @@ @article{Band23 pages = {102755}, volume = {85}, abstract = {Recently, large, high-quality public datasets have led to the development of convolutional neural networks that can detect lymph node metastases of breast cancer at the level of expert pathologists. Many cancers, regardless of the site of origin, can metastasize to lymph nodes. However, collecting and annotating high-volume, high-quality datasets for every cancer type is challenging. In this paper we investigate how to leverage existing high-quality datasets most efficiently in multi-task settings for closely related tasks. Specifically, we will explore different training and domain adaptation strategies, including prevention of catastrophic forgetting, for breast, colon and head-and-neck cancer metastasis detection in lymph nodes. - + Our results show state-of-the-art performance on colon and head-and-neck cancer metastasis detection tasks. We show the effectiveness of adaptation of networks from one cancer type to another to obtain multi-task metastasis detection networks. Furthermore, we show that leveraging existing high-quality datasets can significantly boost performance on new target tasks and that catastrophic forgetting can be effectively mitigated. Last, we compare different mitigation strategies.}, file = {Band23.pdf:pdf\\Band23.pdf:PDF}, journal = {Medical Image Analysis}, @@ -2709,13 +2709,13 @@ @conference{Beck16 booktitle = RSNA, year = {2016}, abstract = {PURPOSE: We aimed to evaluate the additional value of brain {CT} perfusion ({CTP}) for intracranial vessel occlusion detection in acute ischemic stroke for observers with different levels of experience. 
- + METHOD AND MATERIALS: We retrospectively included all patients with symptoms of acute ischemic stroke (onset of less than 9 hours) who were scanned with non-enhanced {CT} ({NECT}), {CT} angiography ({CTA}) and {CTP} in the year 2015. Four observers with different levels of experience (neuroradiologist, non-neuroradiologist, two radiology residents) evaluated the imaging data with 2 imaging strategies. Method 1 included {NECT} and {CTA}. For method 2, additional {CTP} maps were provided for the evaluation of intracranial vessel occlusion on {CTA}. The observers were blinded to patient identity and clinical outcome. Receiver operating characteristic ({ROC}) was used for the evaluation of accuracy in intracranial vessel occlusion detection. The reference standard of vessel occlusion was set based on the evaluation by the four observers, and the judgment of an independent neuroradiologist serving as a referee in case of discrepancy. - + RESULTS: In total 110 patients were included, preliminary analyses included 94 patients. There was an increase of {AUC} in the overall detection of intracranial vessel occlusion for observer 1, 3 and 4, though only for observer 1 the increase in {AUC} was statistically significant (p=0.041). Increase of intracranial vessel occlusion detection mainly concerned distal vessel occlusions. No significant added value of {CTP} was found for proximal vessel occlusions, with already a high accuracy based on {NECT} and {CTA} for all experience levels with sensitivity ranging between 86-94% and specificity between 92-100%. - + CONCLUSION: Our study demonstrates that the use of {CTP} can aid in the detection of distal intracranial vessel occlusions on {CTA} in case {CTP} is integrated in the reading strategy. It is also demonstrated that {CTP} was not of added value for the detection of proximal intracranial vessel occlusions. 
Finally, there was no major difference in the diagnostic accuracy of intracranial vessel occlusion detection for the different levels in experience of the observers. - + CLINICAL RELEVANCE/APPLICATION: Our study demonstrated that brain {CT} perfusion can aid in the detection of distal intracranial vessel occlusions, which is clinically relevant for optimizing the imaging strategy in acute ischemic stroke.}, optnote = {DIAG, RADIOLOGY}, } @@ -2730,11 +2730,11 @@ @article{Beck19a pages = {124-129}, doi = {10.1016/j.neurad.2018.03.003}, abstract = {Background and purpose: To evaluate whether brain CT perfusion (CTP) aids in the detection of intracranial vessel occlusion on CT angiography (CTA) in acute ischemic stroke. - + Materials and methods: Medical-ethical committee approval of our hospital was obtained and informed consent was waived. Patients suspected of acute ischemic stroke who underwent non-contrast CT(NCCT), CTA and whole-brain CTP in our center in the year 2015 were included. Three observers with different levels of experience evaluated the imaging data of 110 patients for the presence or absence of intracranial arterial vessel occlusion with two strategies. In the first strategy, only NCCT and CTA were available. In the second strategy, CTP maps were provided in addition to NCCT and CTA. Receiver-operating-characteristic (ROC) analysis was used for the evaluation of diagnostic accuracy. - + Results: Overall, a brain perfusion deficit was scored present in 87-89% of the patients with an intracranial vessel occlusion, more frequently observed in the anterior than in the posterior circulation. Performance of intracranial vessel occlusion detection on CTA was significantly improved with the availability of CTP maps as compared to the first strategy (P = 0.023), due to improved detection of distal and posterior circulation vessel occlusions (P-values of 0.032 and 0.003 respectively). 
No added value of CTP was found for intracranial proximal vessel occlusion detection, with already high accuracy based on NCCT and CTA alone. - + Conclusion: The performance of intracranial vessel occlusion detection on CTA was improved with the availability of brain CT perfusion maps due to the improved detection of distal and posterior circulation vessel occlusions.}, file = {pdf\\Beck19a.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -2854,7 +2854,7 @@ @phdthesis{Bejn17a title = {Histopathological diagnosis of breast cancer using machine learning}, url = {https://repository.ubn.ru.nl/handle/2066/178907}, abstract = {Application of machine learning to WSI is a promising yet largely unexplored field of research. The primary aim of the research described in this thesis was to develop automated systems for analysis of H&E stained breast histopathological images. This involved automatic detection of ductal carcinoma in-situ (DCIS), invasive, and metastatic breast cancer in whole-slide histopathological images. A secondary aim was to identify new diagnostic biomarkers for the detection of invasive breast cancer. To this end the research was undertaken with the following objectives: - + 1. Development of an algorithm for standardization of H&E stained WSIs; 2. Detection, classification and segmentation of primary breast cancer; 3. Evaluation of the state of the art of machine learning algorithms for automatic detection of lymph nodes metastases; @@ -3212,13 +3212,13 @@ @article{Blek19 doi = {https://doi.org/10.1007/s00330-019-06488-y}, abstract = {Objectives To create a radiomics approach based on multiparametric magnetic resonance imaging (mpMRI) features extracted from an auto-fixed volume of interest (VOI) that quantifies the phenotype of clinically significant (CS) peripheral zone (PZ) prostate cancer (PCa). - + Methods This study included 206 patients with 262 prospectively called mpMRI prostate imaging reporting and data system 3-5 PZ lesions. 
Gleason scores > 6 were defined as CS PCa. Features were extracted with an auto-fixed 12-mm spherical VOI placed around a pin point in each lesion. The value of dynamic contrast-enhanced imaging(DCE), multivariate feature selection and extreme gradient boosting (XGB) vs. univariate feature selection and random forest (RF), expert-based feature pre-selection, and the addition of image filters was investigated using the training (171 lesions) and test (91 lesions) datasets. - + Results The best model with features from T2-weighted (T2-w) + diffusion-weighted imaging (DWI) + DCE had an area under the curve (AUC) of 0.870 (95% CI 0.980-0.754). Removal of DCE features decreased AUC to 0.816 (95% CI 0.920-0.710), although not significantly (p = 0.119). Multivariate and XGB outperformed univariate and RF (p = 0.028). Expert-based feature pre-selection and image filters had no significant contribution. - + Conclusions The phenotype of CS PZ PCa lesions can be quantified using a radiomics approach based on features extracted from T2-w + DWI using an auto-fixed VOI. Although DCE features improve diagnostic performance, this is not statistically significant. Multivariate feature selection and XGB should be preferred over univariate feature selection and RF. The developed model may be a valuable addition to traditional visual assessment in diagnosing CS PZ PCa.}, file = {:pdf/Blek19.pdf:PDF}, @@ -3241,13 +3241,13 @@ @article{Blek21 optnote = {DIAG, RADIOLOGY}, abstract = {Objectives To investigate a previously developed radiomics-based biparametric magnetic resonance imaging (bpMRI) approach for discrimination of clinically significant peripheral zone prostate cancer (PZ csPCa) using multi-center, multi-vendor (McMv) and single-center, single-vendor (ScSv) datasets. - + Methods This study's starting point was a previously developed ScSv algorithm for PZ csPCa whose performance was demonstrated in a single-center dataset. 
A McMv dataset was collected, and 262 PZ PCa lesions (9 centers, 2 vendors) were selected to identically develop a multi-center algorithm. The single-center algorithm was then applied to the multi-center dataset (single-multi-validation), and the McMv algorithm was applied to both the multi-center dataset (multi-multi-validation) and the previously used single-center dataset (multi-single-validation). The areas under the curve (AUCs) of the validations were compared using bootstrapping. - + Results Previously the single-single validation achieved an AUC of 0.82 (95% CI 0.71-0.92), a significant performance reduction of 27.2% compared to the single-multi-validation AUC of 0.59 (95% CI 0.51-0.68). The new multi-center model achieved a multi-multi-validation AUC of 0.75 (95% CI 0.64-0.84). Compared to the multi-single-validation AUC of 0.66 (95% CI 0.56-0.75), the performance did not decrease significantly (p value: 0.114). Bootstrapped comparison showed similar single-center performances and a significantly different multi-center performance (p values: 0.03, 0.012). - + Conclusions A single-center trained radiomics-based bpMRI model does not generalize to multi-center data. Multi-center trained radiomics-based bpMRI models do generalize, have equal single-center performance and perform better on multi-center data.}, taverne_url = {https://repository.ubn.ru.nl/handle/2066/239809}, @@ -3556,13 +3556,13 @@ @conference{Bokh20 title = {Computer-assisted hot-spot selection for tumor budding assessment in colorectal cancer}, abstract = {Background & objectives Tumor budding (TB) is an established prognosticator for colorectal cancer. Detection of the hot-spot to score TB is based on visual inspection, hindering reproducibility of this important factor. We present an algorithm that can potentially assist pathologists in this task. 
- + Methods We used a previously developed algorithm for the detection of tumor buds in pan-cytokeratin stained whole slide images, calculating the number of buds for each location using a circle with 0.785mm2 surface area. From these numbers, density heatmaps were produced. The algorithm was applied to 270 slides from Bern University hospital, in which hot-spots and tumor buds were visually identified. - + Results Heat maps were created and we located the hand-selected hotspot and noted the associated TB number. The differences and similarities between computer identified and manually selected hot-spots were visually assessed as well as via histograms. Preliminary results show that the heatmaps are helpful, as locations with the highest TB density (the top 15%) also include the hand-selected hotspots. The full results will be presented during the conference. - + Conclusion The presented algorithm can assist the pathologist in selecting the hot-spot with the highest tumor bud count with more ease at low magnification and can help to reduce the high interobserver variability among pathologists in scoring tumor budding.}, optnote = {DIAG}, @@ -3575,13 +3575,13 @@ @conference{Bokh20a title = {Deep learning based tumor bud detection in pan-cytokeratin stained colorectal cancer whole-slide images}, abstract = {Background & objectives Tumor budding (TB) is an established prognosticator for colorectal cancer. Deep learning based TB assessment has the potential to improve diagnostic reproducibility and efficiency. We developed an algorithm that can detect individual tumor buds in pan-cytokeratin stained colorectal cancer slides - + Methods Tumor-bud candidates (n=1765, collected from 58 whole slide images; WSI) were labeled by seven experts as either TB, poorly differentiated cluster, or neither. The 58 slides were randomly split into a training (49) and test-set (9). A deep learning (DL) model was trained using the buds identified by the experts in the training set. 
- + Results The algorithm was tested on the nine remaining WSI and 270 WSI from pan-cytokeratin stained slides from Bern University hospital, in which hot spots and TB were manually scored. An F1 score of 0.82 was found for correspondence at the bud level between experts and DL. A correlation of 0.745 was found between the manually counted buds within the hotspots and the automated method in the 270 WSIs. - + Conclusion Assessment of tumor budding as a prognostic factor for colorectal cancer can be automated using deep learning. At the level of individual tumor buds, correspondence between DL and experts is high and comparable to the inter-rater variability. However, compared to the manual procedure, the algorithm yields higher counts for cases with relatively high bud densities (>15). Follow-up studies will focus on the assessment of TB in H&E stained slides.}, optnote = {DIAG}, @@ -3866,7 +3866,7 @@ @article{Bort21 url = {https://arxiv.org/abs/2006.06356}, author = {Bortsova, Gerda and Gonz\'{a}lez-Gonzalo, Cristina and Wetstein, Suzanne C. and Dubost, Florian and Katramados, Ioannis and Hogeweg, Laurens and Liefers, Bart and van Ginneken, Bram and Pluim, Josien P.W. and Veta, Mitko and S\'{a}nchez, Clara I. and de Bruijne, Marleen}, abstract = {Adversarial attacks are considered a potentially serious security threat for machine learning systems. Medical image analysis (MedIA) systems have recently been argued to be vulnerable to adversarial attacks due to strong financial incentives and the associated technological infrastructure. In this paper, we study previously unexplored factors affecting adversarial attack vulnerability of deep learning MedIA systems in three medical domains: ophthalmology, radiology, and pathology. 
We focus on adversarial black-box settings, in which the attacker does not have full access to the target model and usually uses another model, commonly referred to as surrogate model, to craft adversarial examples that are then transferred to the target model. We consider this to be the most realistic scenario for MedIA systems. Firstly, we study the effect of weight initialization (pre-training on ImageNet or random initialization) on the transferability of adversarial attacks from the surrogate model to the target model, i.e., how effective attacks crafted using the surrogate model are on the target model. Secondly, we study the influence of differences in development (training and validation) data between target and surrogate models. We further study the interaction of weight initialization and data differences with differences in model architecture. All experiments were done with a perturbation degree tuned to ensure maximal transferability at minimal visual perceptibility of the attacks. Our experiments show that pre-training may dramatically increase the transferability of adversarial examples, even when the target and surrogate's architectures are different: the larger the performance gain using pre-training, the larger the transferability. Differences in the development data between target and surrogate models considerably decrease the performance of the attack; this decrease is further amplified by difference in the model architecture. We believe these factors should be considered when developing security-critical MedIA systems planned to be deployed in clinical practice. We recommend avoiding using only standard components, such as pre-trained architectures and publicly available datasets, as well as disclosure of design specifications, in addition to using adversarial defense methods. 
When evaluating the vulnerability of MedIA systems to adversarial attacks, various attack scenarios and target-surrogate differences should be simulated to achieve realistic robustness estimates. The code and all trained models used in our experiments are publicly available. - + (The first three authors contributed equally to this work.)}, publisher = {Elsevier}, optnote = {DIAG}, @@ -4076,16 +4076,16 @@ @article{Bozo17 abstract = { Purpose: The aim of the study was to retrospectively evaluate the diagnostic imaging that potential lung donors undergo, the reader variability of image interpretation and its relevance for donation, and the potential information gained from imaging studies not primarily intended for lung evaluation but partially including them. - - + + Materials and Methods: Bedside chest radiography and computed tomography (CT), completely or incompletely including the lungs, of 110 brain-dead potential organ donors in a single institution during 2007 to 2014 were reviewed from a donation perspective. Two chest radiologists in consensus analyzed catheters and cardiovascular, parenchymal, and pleural findings. Clinical reports and study review were compared for substantial differences in findings that could have led to a treatment change, triggered additional examinations such as bronchoscopy, or were considered important for donation. - - + + Results: Among 136 bedside chest radiographs, no differences between clinical reports and study reviews were found in 37 (27%), minor differences were found in 28 (21%), and substantial differences were found in 71 (52%) examinations (P<0.0001). In 31 of 42 (74%) complete or incomplete CT examinations, 50 of 74 findings with relevance for lung donation were not primarily reported (P<0.0001). - - + + Conclusions: The majority of donor patients undergo only chest radiography. A targeted imaging review of abnormalities affecting the decision to use donor lungs may be useful in the preoperative stage. 
With a targeted list, substantial changes were made from initial clinical interpretations. CT can provide valuable information on donor lung pathology, even if the lungs are only partially imaged. }, @@ -4836,16 +4836,16 @@ @article{Bult20 algorithm = {https://grand-challenge.org/algorithms/gleason-grading-of-prostate-biopsies/}, abstract = {BACKGROUND: The Gleason score is the strongest correlating predictor of recurrence for prostate cancer, but has substantial inter-observer variability, limiting its usefulness for individual patients. Specialised urological pathologists have greater concordance; however, such expertise is not widely available. Prostate cancer diagnostics could thus benefit from robust, reproducible Gleason grading. We aimed to investigate the potential of deep learning to perform automated Gleason grading of prostate biopsies. - + METHODS: In this retrospective study, we developed a deep-learning system to grade prostate biopsies following the Gleason grading standard. The system was developed using randomly selected biopsies, sampled by the biopsy Gleason score, from patients at the Radboud University Medical Center (pathology report dated between Jan 1, 2012, and Dec 31, 2017). A semi-automatic labelling technique was used to circumvent the need for manual annotations by pathologists, using pathologists' reports as the reference standard during training. The system was developed to delineate individual glands, assign Gleason growth patterns, and determine the biopsy-level grade. For validation of the method, a consensus reference standard was set by three expert urological pathologists on an independent test set of 550 biopsies. Of these 550, 100 were used in an observer experiment, in which the system, 13 pathologists, and two pathologists in training were compared with respect to the reference standard. 
The system was also compared to an external test dataset of 886 cores, which contained 245 cores from a different centre that were independently graded by two pathologists. - + FINDINGS: We collected 5759 biopsies from 1243 patients. The developed system achieved a high agreement with the reference standard (quadratic Cohen's kappa 0.918, 95% CI 0.891-0.941) and scored highly at clinical decision thresholds: benign versus malignant (area under the curve 0.990, 95% CI 0.982-0.996), grade group of 2 or more (0.978, 0.966-0.988), and grade group of 3 or more (0.974, 0.962-0.984). In an observer experiment, the deep-learning system scored higher (kappa 0.854) than the panel (median kappa 0.819), outperforming 10 of 15 pathologist observers. On the external test dataset, the system obtained a high agreement with the reference standard set independently by two pathologists (quadratic Cohen's kappa 0.723 and 0.707) and within inter-observer variability (kappa 0.71). - + INTERPRETATION: Our automated deep-learning system achieved a performance similar to pathologists for Gleason grading and could potentially contribute to prostate cancer diagnosis. The system could potentially assist pathologists by screening biopsies, providing second opinions on grade group, and presenting quantitative measurements of volume percentages. - + FUNDING: Dutch Cancer Society.}, file = {:pdf/Bult20.pdf:PDF}, @@ -4899,7 +4899,7 @@ @phdthesis{Bult22a year = {2022}, url = {https://repository.ubn.ru.nl/handle/2066/241550}, abstract = {The histological grading of prostate biopsies is a crucial element in the diagnostic pathway of prostate cancer. The known high inter- and intraobserver variability show potential and a need for assisting pathologists in this task. Furthermore, a global shortage of pathologists stresses the demand for reproducible, more efficient, and easily accessible diagnostic solutions. 
This thesis's primary aim was to investigate and design an AI-based system to detect and grade prostate cancer in biopsies. A second aim was to evaluate the potential clinical merits of AI-assisted grading when such systems are embedded in the pathologist's workflow. To this extent, the following objectives were undertaken as part of this thesis: - + 1. The development of an automated system that can distinguish epithelial tissue from other tissue types within H&E stained prostate specimens (Chapter 2); 2. The development and validation of an automated system for grading prostate biopsies using the Gleason grading system (Chapter 3); 3. A multi-center independent evaluation of state-of-the-art algorithms for automated Gleason grading sourced through a large-scale medical AI competition(Chapter 4); @@ -5298,7 +5298,7 @@ @article{Char16c doi = {10.1016/j.media.2016.11.001}, url = {http://dx.doi.org/10.1016/j.media.2016.11.001}, abstract = {We propose a novel method to improve airway segmentation in thoracic computed tomography (CT) by detecting and removing leaks. Leak detection is formulated as a classification problem, in which a convolutional network (ConvNet) is trained in a supervised fashion to perform the classification task. In order to increase the segmented airway tree length, we take advantage of the fact that multiple segmentations can be extracted from a given airway segmentation algorithm by varying the parameters that influence the tree length and the amount of leaks. We propose a strategy in which the combination of these segmentations after removing leaks can increase the airway tree length while limiting the amount of leaks. This strategy therefore largely circumvents the need for parameter fine-tuning of a given airway segmentation algorithm. - + The ConvNet was trained and evaluated using a subset of inspiratory thoracic CT scans taken from the COPDGene study. Our method was validated on a separate independent set of the EXACT'09 challenge. 
We show that our method significantly improves the quality of a given leaky airway segmentation, achieving a higher sensitivity at a low false-positive rate compared to all the state-of-the-art methods that entered in EXACT09, and approaching the performance of the combination of all of them.}, file = {Char16c.pdf:pdf\\Char16c.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -5468,7 +5468,7 @@ @phdthesis{Chle22 abstract = {This thesis is devoted to the applications of deep learning segmentation algorithms to multimodal abdominal imaging. It focuses on the segmentation of liver, prostate, and liver tumors in CT and MRI images. It aims not only to propose and evaluate new segmentation architectures, but also to investigate aspects such as the required time for the correction of automatic segmentation results, the impact on the inter-observer variability, and the optimization of annotation effort. The following objectives were undertaken as part of this thesis: - + 1. The development of a two-stage cascade system for liver and liver tumor segmentation in CT images (Chapter 2); 2. The development of an ensemble of three orthogonal 2D CNNs for liver segmentation in late-phase T1W MRI images (Chapter 3); 3. The investigation of various active learning strategies to optimally select a set of CT slices to obtain the best possible liver segmentation method in CT without the need to manually annotate a large amount of training data (Chapter 4); @@ -6834,19 +6834,19 @@ @article{Donn19 doi = {10.1093/cid/ciz1008}, year = {2019}, abstract = {Abstract - + Background Invasive fungal diseases (IFDs) remain important causes of morbidity and mortality. The consensus definitions of the Infectious Diseases Group of the European Organization for Research and Treatment of Cancer and the Mycoses Study Group have been of immense value to researchers who conduct clinical trials of antifungals, assess diagnostic tests, and undertake epidemiologic studies. 
However, their utility has not extended beyond patients with cancer or recipients of stem cell or solid organ transplants. With newer diagnostic techniques available, it was clear that an update of these definitions was essential. - - + + Methods To achieve this, 10 working groups looked closely at imaging, laboratory diagnosis, and special populations at risk of IFD. A final version of the manuscript was agreed upon after the groups' findings were presented at a scientific symposium and after a 3-month period for public comment. There were several rounds of discussion before a final version of the manuscript was approved. - - + + Results There is no change in the classifications of "proven," "probable," and "possible" IFD, although the definition of "probable" has been expanded and the scope of the category "possible" has been diminished. The category of proven IFD can apply to any patient, regardless of whether the patient is immunocompromised. The probable and possible categories are proposed for immunocompromised patients only, except for endemic mycoses. - - + + Conclusions These updated definitions of IFDs should prove applicable in clinical, diagnostic, and epidemiologic research of a broader range of patients at high-risk. }, @@ -6981,7 +6981,7 @@ @conference{Eeke22 booktitle = {ECP}, year = {2022}, abstract = {Artificial intelligence (AI) based quantification of cell-level PD-L1 status enables spatial analysis and allows reliable and reproducible assessment of the tumor proportion score. In this study, we assess the cell-level inter-pathologist agreement as human benchmark for AI development and validation. Three pathologists manually annotated the centers of all nuclei within 53 regions of interest in 12 whole-slide images (40X magnification) of NSCLC cases and classified them as PD-L1 negative/positive tumor cells, PD-L1 positive immune cells or other cells. 
Agreement was quantified using F1 score analysis, with agreement defined as annotations less than 10 um apart and of the same class. An average of 9044 nuclei (1550 negative, 2367 positive tumor cells, 1244 positive immune cells, 3881 other cells) were manually annotated by the three pathologists. The mean F1 score over pairs of pathologists at dataset level was 0.59 (range 0.54-0.65). When split across classes, the mean per-pair F1 scores stay approximately the same, indicating the readers perform similarly regardless of cell type. Besides human variability in manual point annotations with respect to the center of nuclei, lack of context contributed to disagreement: readers who reported they solely examined the ROIs tended to disagree more with readers that reported they also looked outside the ROIs for additional (morphological/density) information. - + In conclusion, agreement on determining the PD-L1 status of individual cells is only moderate, suggesting a role for AI. By quantifying the inter-rater agreement of pathologists, we have created a human benchmark which may serve as an upper bound (and could be combined via majority vote) for the validation of AI at cell-level, something not done previously. Cell-level AI-based assessment of PD-L1 may supersede slide level scoring, adding significant information on the heterogeneity and spatial distribution over the tumor.}, optnote = {DIAG, RADIOLOGY}, } @@ -6992,9 +6992,9 @@ @conference{Eeke22a booktitle = {ECP}, year = {2022}, abstract = {Nuclei detection in histopathology images is an important prerequisite step of downstream research and clinical analyses, such as counting cells and spatial interactions. In this study, we developed an AI-based nuclei detector using the YOLOv5 framework in whole-slide NSCLC cases. Our dataset consisted of 42 PD-L1 stained cases (30 training, 12 test). 
Four trained (non-expert) readers manually annotated all nuclei (both positive/negative) within regions of interest (ROIs) viewed at 40X magnification. We trained a YOLOv5(s) network on annotations of one reader. Performance was measured using F1 score analysis; hits were defined as being less than 10 um away from annotations. - + We evaluate YOLOv5 on the test set by pairing it against all four readers separately. There, YOLOv5 performs excellently, falling within the interrater variability of the four readers: the mean F1 score over algorithm-reader pairs is 0.84 (range 0.76-0.92) while the mean F1 score over pairs of readers is 0.82 (range 0.76-0.86). When we determine the cell count (number of annotations/predictions) per ROI in the test set, agreement of algorithm-reader pairs and reader pairs is equally well aligned: 0.93 (range 0.90-0.97) versus 0.94 (range 0.92-0.96). Visual inspection indicates YOLOv5 performs equally well on PD-L1 positive and negative cells. - + In future work, we could extend this detector to additional tissues and immunohistochemistry stainings. Moreover, this detector could be used as an AI-assisted manual point annotation tool: while human readers perform the (context-driven) task of delineating homogeneous regions (e.g. clusters of PD-L1 positive stained cells), the detector performs the (local, yet laborious) task of identifying individual nuclei within these regions, providing labelled point annotations.}, optnote = {DIAG, RADIOLOGY}, } @@ -7076,13 +7076,13 @@ @article{Eerd21a year = {2021}, abstract = {Abstract Introduction In order to augment the certainty of the radiological interpretation of "possible microbleeds" after traumatic brain injury (TBI), we assessed their longitudinal evolution on 3-T SWI in patients with moderate/severe TBI. - + Methods Standardized 3-T SWI and T1-weighted imaging were obtained 3 and 26 weeks after TBI in 31 patients. 
Their microbleeds were computer-aided detected and classified by a neuroradiologist as no, possible, or definite at baseline and follow-up, separately (single-scan evaluation). Thereafter, the classifications were re-evaluated after comparison between the time-points (post-comparison evaluation). We selected the possible microbleeds at baseline at single-scan evaluation and recorded their post-comparison classification at follow-up. - + Results Of the 1038 microbleeds at baseline, 173 were possible microbleeds. Of these, 53.8% corresponded to no microbleed at follow-up. At follow-up, 30.6% were possible and 15.6% were definite. Of the 120 differences between baseline and follow-up, 10% showed evidence of a pathophysiological change over time. Proximity to extra-axial injury and proximity to definite microbleeds were independently predictive of becoming a definite microbleed at follow-up. The reclassification level differed between anatomical locations. - + Conclusions Our findings support disregarding possible microbleeds in the absence of clinical consequences. In selected cases, however, a follow-up SWI-scan could be considered to exclude evolution into a definite microbleed. }, @@ -7515,11 +7515,11 @@ @conference{Enge19 booktitle = ARVO, title = {Automatic Segmentation of Drusen and Exudates on Color Fundus Images using Generative Adversarial Networks}, abstract = {Purpose: The presence of drusen and exudates, visible as bright lesions on color fundus images, is one of the early signs of visual threatening diseases such as Age-related Macular Degeneration and Diabetic Retinopathy. Accurate detection and quantification of these lesions during screening can help identify patients that would benefit from treatment. We developed a method based on generative adversarial networks (GANs) to segment bright lesions on color fundus images. - + Methods: We used 4179 color fundus images that were acquired during clinical routine. 
The images were contrast enhanced to increase the contrast between bright lesions and the background. All bright lesions were manually annotated by marking the center point of the lesions. The GAN was trained to estimate the image without bright lesions. The final segmentation was obtained by taking the difference between the input image and the estimated output. - + Results: This method was applied to an independent test set of 52 color fundus images with non-advanced stages of AMD from the European Genetic Database, which were fully segmented for bright lesions by two trained human observers. The method achieved Dice scores of 0.4862 and 0.4849 when compared to the observers, whereas the inter-observer Dice score was 0.5043. The total segmented bright lesion area per image was evaluated using the intraclass correlation (ICC). The method scored 0.8537 and 0.8352 when compared to the observers, whereas the inter-observer ICC was 0.8893. - + Conclusions: The results show the performance is close to the agreement between trained observers. This automatic segmentation of bright lesions can help early diagnosis of visual threatening diseases and opens the way for large scale clinical trials.}, optnote = {DIAG, RADIOLOGY}, year = {2019}, @@ -7872,7 +7872,7 @@ @article{Four21 abstract = { Abstract Existing quantitative imaging biomarkers (QIBs) are associated with known biological tissue characteristics and follow a well-understood path of technical, biological and clinical validation before incorporation into clinical trials. In radiomics, novel data-driven processes extract numerous visually imperceptible statistical features from the imaging data with no a priori assumptions on their correlation with biological processes. The selection of relevant features (radiomic signature) and incorporation into clinical trials therefore requires additional considerations to ensure meaningful imaging endpoints. 
Also, the number of radiomic features tested means that power calculations would result in sample sizes impossible to achieve within clinical trials. This article examines how the process of standardising and validating data-driven imaging biomarkers differs from those based on biological associations. Radiomic signatures are best developed initially on datasets that represent diversity of acquisition protocols as well as diversity of disease and of normal findings, rather than within clinical trials with standardised and optimised protocols as this would risk the selection of radiomic features being linked to the imaging process rather than the pathology. Normalisation through discretisation and feature harmonisation are essential pre-processing steps. Biological correlation may be performed after the technical and clinical validity of a radiomic signature is established, but is not mandatory. Feature selection may be part of discovery within a radiomics-specific trial or represent exploratory endpoints within an established trial; a previously validated radiomic signature may even be used as a primary/secondary endpoint, particularly if associations are demonstrated with specific biological processes and pathways being targeted within clinical trials. - + Key Points * Data-driven processes like radiomics risk false discoveries due to high-dimensionality of the dataset compared to sample size, making adequate diversity of the data, cross-validation and external validation essential to mitigate the risks of spurious associations and overfitting. * Use of radiomic signatures within clinical trials requires multistep standardisation of image acquisition, image analysis and data mining processes. 
@@ -8590,13 +8590,13 @@ @article{Ghaf16a doi = {10.1118/1.4966029}, abstract = {Purpose: White matter hyperintensities (WMH) are seen on FLAIR-MRI in several neurological disorders, including multiple sclerosis, dementia, Parkinsonism, stroke and cerebral small vessel disease (SVD). WMHs are often used as biomarkers for prognosis or disease progression in these diseases, and additionally longitudinal quantification of WMHs is used to evaluate therapeutic strategies. Human readers show considerable disagreement and inconsistency on detection of small lesions. A multitude of automated detection algorithms for WMHs exists, but since most of the current automated approaches are tuned to optimize segmentation performance according to Jaccard or Dice scores, smaller WMHs often go undetected in these approaches. In this paper, the authors propose a method to accurately detect all WMHs, large as well as small. - + Methods: A two-stage learning approach was used to discriminate WMHs from normal brain tissue. Since small and larger WMHs have quite a different appearance, the authors have trained two probabilistic classifiers: one for the small WMHs (<3 mm effective diameter) and one for the larger WMHs (>3 mm in-plane effective diameter). For each size-specific classifier, an Adaboost is trained for five iterations, with random forests as the basic classifier. The feature sets consist of 22 features including intensities, location information, blob detectors, and second order derivatives. The outcomes of the two first-stage classifiers were combined into a single WMH likelihood by a second-stage classifier. Their method was trained and evaluated on a dataset with MRI scans of 362 SVD patients (312 subjects for training and validation annotated by one and 50 for testing annotated by two trained raters). 
To analyze performance on the separate test set, the authors performed a free-response receiver operating characteristic (FROC) analysis, instead of using segmentation based methods that tend to ignore the contribution of small WMHs. - + Results: Experimental results based on FROC analysis demonstrated a close performance of the proposed computer aided detection (CAD) system to human readers. While an independent reader had 0.78 sensitivity with 28 false positives per volume on average, their proposed CAD system reaches a sensitivity of 0.73 with the same number of false positives. - + Conclusions: The authors have developed a CAD system with all its ingredients being optimized for a better detection of WMHs of all size, which shows performance close to an independent reader.}, file = {Ghaf16a.pdf:pdf\\Ghaf16a.pdf:PDF}, @@ -8727,9 +8727,9 @@ @article{Gibs17 doi = {10.1016/j.media.2017.07.004}, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5666910/}, abstract = {Segmentation algorithms are typically evaluated by comparison to an accepted reference standard. The cost of generating accurate reference standards for medical image segmentation can be substantial. Since the study cost and the likelihood of detecting a clinically meaningful difference in accuracy both depend on the size and on the quality of the study reference standard, balancing these trade-offs supports the efficient use of research resources. - + In this work, we derive a statistical power calculation that enables researchers to estimate the appropriate sample size to detect clinically meaningful differences in segmentation accuracy (i.e. the proportion of voxels matching the reference standard) between two algorithms. Furthermore, we derive a formula to relate reference standard errors to their effect on the sample sizes of studies using lower-quality (but potentially more affordable and practically available) reference standards. 
- + The accuracy of the derived sample size formula was estimated through Monte Carlo simulation, demonstrating, with 95% confidence, a predicted statistical power within 4% of simulated values across a range of model parameters. This corresponds to sample size errors of less than 4 subjects and errors in the detectable accuracy difference less than 0.6%. The applicability of the formula to real-world data was assessed using bootstrap resampling simulations for pairs of algorithms from the PROMISE12 prostate MR segmentation challenge data set. The model predicted the simulated power for the majority of algorithm pairs within 4% for simulated experiments using a high-quality reference standard and within 6% for simulated experiments using a low-quality reference standard. A case study, also based on the PROMISE12 data, illustrates using the formulae to evaluate whether to use a lower-quality reference standard in a prostate segmentation study.}, file = {Gibs17.pdf:pdf\\Gibs17.pdf:PDF}, optnote = {DIAG}, @@ -9523,7 +9523,7 @@ @conference{Ginn18b year = {2018}, doi = {10.5334/jbsr.1656}, abstract = {Artificial intelligence (AI), particularly deep learning, is currently at the top of the hype cycle. Application of this technology to the analysis of medical images is attracting a lot of attention worldwide. - + At the same time, the average radiologist is using very little to no AI tools in her daily practice. This lecture provides a brief explanation of deep learning and explains what makes this technology different from previous approaches and why it is so powerful. A number of AI applications, some in use that were developed and commercialized in our research group, are presented. These applications serve as examples to define a number of different types of AI products that differ in the way they are placed in (or outside) the workflow of radiologists. 
This lecture emphasizes how some of these tools replace (a small part of the work of) radiologists, while other augment radiologists, and yet others take the radiologists out of the loop in the care cycle of the patient. Finally, it is discussed how radiologists can, and should, be involved in the development of real-life AI applications.}, file = {Ginn18b.pdf:pdf\\Ginn18b.pdf:PDF}, optnote = {DIAG}, @@ -9678,10 +9678,10 @@ @article{Glas23 abstract = {Abstract Background Chest X-ray offers high sensitivity and acceptable specificity as a tuberculosis screening tool, but in areas with a high burden of tuberculosis, there is often a lack of radiological expertise to interpret chest X-ray. Computer-aided detection systems based on artificial intelligence are therefore increasingly used to screen for tuberculosis-related abnormalities on digital chest radiographies. The CAD4TB software has previously been shown to demonstrate high sensitivity for chest X-ray tuberculosis-related abnormalities, but it is not yet calibrated for the detection of non-tuberculosis abnormalities. When screening for tuberculosis, users of computer-aided detection need to be aware that other chest pathologies are likely to be as prevalent as, or more prevalent than, active tuberculosis. However, non-tuberculosis chest X-ray abnormalities detected during chest X-ray screening for tuberculosis remain poorly characterized in the sub-Saharan African setting, with only minimal literature. - + Case presentation In this case series, we report on four cases with non-tuberculosis abnormalities detected on CXR in TB TRIAGE + ACCURACY (ClinicalTrials.gov Identifier: NCT04666311), a study in adult presumptive tuberculosis cases at health facilities in Lesotho and South Africa to determine the diagnostic accuracy of two potential tuberculosis triage tests: computer-aided detection (CAD4TB v7, Delft, the Netherlands) and C-reactive protein (Alere Afinion, USA). 
The four Black African participants presented with the following chest X-ray abnormalities: a 59-year-old woman with pulmonary arteriovenous malformation, a 28-year-old man with pneumothorax, a 20-year-old man with massive bronchiectasis, and a 47-year-old woman with aspergilloma. - + Conclusions Solely using chest X-ray computer-aided detection systems based on artificial intelligence as a tuberculosis screening strategy in sub-Saharan Africa comes with benefits, but also risks. Due to the limitation of CAD4TB for non-tuberculosis-abnormality identification, the computer-aided detection software may miss significant chest X-ray abnormalities that require treatment, as exemplified in our four cases. Increased data collection, characterization of non-tuberculosis anomalies and research on the implications of these diseases for individuals and health systems in sub-Saharan Africa is needed to help improve existing artificial intelligence software programs and their use in countries with high tuberculosis burden. }, @@ -9745,11 +9745,11 @@ @conference{Gome17 booktitle = ARVO, year = {2017}, abstract = {Purpose : To assess the performance of deep learning architectures based on convolutional neural networks (CNN) for the diagnosis of glaucoma in screening campaigns using color fundus images. - + Methods : Two independent data sets were used to develop and evaluate the proposed method. 1) 805 color fundus images with a field of view of 45 degrees, centered on the macula and including the optic disc (OD) from patients with age ranging from 55 to 86 years old included in a glaucoma detection campaign performed at Hospital Esperanza (Barcelona). Annotations were performed by eight observers having 8 to 26 years of clinical experience. 2) 101 images from the publicly available Drishti-GS retinal image dataset (http://cvit.iiit.ac.in/projects/mip/drishti-gs/mip-dataset2/Home.php). 
The total 906 images were further organized into a training, monitoring and test set according to a 60-20-20 split. The process to train and validate the CNN had 3 steps. 1) Preprocessing: the edges and the background were blurred to reduce the effect of the bright fringe and the border. Then patches centered at the OD of size 256x256x3 pixels were automatically segmented and scaled to values from 0 to 1. 2) Implementation: The architecture consisted of ten convolutional layers (32 filters 3x3 pixels size) followed by rectified linear units and spatial max-pooling. The network ends with a fully connected layer and a soft-max classifier which outputs a score from 0 to 1. The network was trained using stochastic gradient descent and a learning rate of 0.005. To avoid overfitting data augmentation was performed applying randomly translations, flipping and rotations during the training, and dropout with probability of 0.5. 3) Monitoring and evaluation: the training was completed after 50 epochs. To evaluate the classification capabilities of the algorithm, the area under the receiver operating characteristic curve (ROC) was calculated using the training set. - + Results : An automatic classification algorithm based on CNN was developed. The present method achieved an area under the ROC of 0.894. The accuracy to identify healthy and glaucoma cases was 0.884 and 0.781 respectively, using a threshold of 0.5. 
- + Conclusions : The good performance of the proposed CNN architecture suggests potential usefulness of these methods for an initial automatic classification of images in screening campaigns for glaucoma.}, optnote = {DIAG, RADIOLOGY}, gsid = {11583482517657678688}, @@ -9800,11 +9800,11 @@ @conference{Gonz19a url = {https://iovs.arvojournals.org/article.aspx?articleid=2746850}, title = {Opening the "black box" of deep learning in automated screening of eye diseases}, abstract = {Purpose: Systems based on deep learning (DL) have demonstrated to provide a scalable and high-performance solution for screening of eye diseases. However, DL is usually considered a "black box" due to lack of interpretability. We propose a deep visualization framework to explain the decisions made by a DL system, iteratively unveiling abnormalities responsible for referable predictions without needing lesion-level annotations. We apply the framework to automated screening of diabetic retinopathy (DR) in color fundus images (CFIs). - + Methods: The proposed framework consists of a baseline deep convolutional neural network to classify CFIs by DR stage. For each CFI classified as referable DR, the framework extracts initial visual evidence of the predicted stage by computing a saliency map, which indicates regions in the image that would contribute the most to changes in the prediction if modified. This provides localization of abnormalities that are then removed through selective inpainting. The image is again classified, expecting reduced referability. We iteratively apply this procedure to increase attention to less discriminative areas and generate refined visual evidence. The Kaggle DR database, with CFIs graded regarding DR severity (stages 0 and 1: non-referable DR, stages 2 to 4: referable DR), is used for training and validation of the image-level classification task. 
For validation of the obtained visual evidence, we used the DiaretDB1 dataset, which contains CFIs with manually-delineated areas for 4 types of lesions: hemorrhages, microaneurysms, hard and soft exudates. - + Results: The baseline classifier obtained an area under the Receiver Operating Characteristic (ROC) curve of 0.93 and a quadratic weighted kappa of 0.77 on the Kaggle test set (53576 CFIs). Free-response ROC (FROC) curves (Figure 2) analyze the correspondence between highlighted areas and each type of lesion for those images classified as referable DR in the DiaretDB1 dataset (62 CFIs), comparing between initial and refined visual evidence. - + Conclusions : The proposed framework provides visual evidence for the decisions made by a DL system, iteratively unveiling abnormalities in CFIs based on the prediction of a classifier trained only with image-level labels. This provides a "key" to open the "black box" of artificial intelligence in screening of eye diseases, aiming to increase experts' trust and facilitate its integration in screening settings.}, optnote = {DIAG, RADIOLOGY}, year = {2019}, @@ -9886,11 +9886,11 @@ @conference{Gonz21 url = {https://iovs.arvojournals.org/article.aspx?articleid=2773295}, title = {Hierarchical curriculum learning for robust automated detection of low-prevalence retinal disease features: application to reticular pseudodrusen}, abstract = {Purpose: The low prevalence of certain retinal disease features compromises data collection for deep neural networks (DNN) development and, consequently, the benefits of automated detection. We robustify the detection of such features in scarce data settings by exploiting hierarchical information available in the data to learn from generic to specific, low-prevalence features. We focus on reticular pseudodrusen (RPD), a hallmark of intermediate age-related macular degeneration (AMD). 
- + Methods: Color fundus images (CFI) from the AREDS dataset were used for DNN development (106,994 CFI) and testing (27,066 CFI). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. In both datasets CFI were graded from generic to specific features. This allows to establish a hierarchy of binary classification tasks with decreasing prevalence: presence of AMD findings (AREDS prevalence: 88%; RS1-6: 77%), drusen (85%; 73%), large drusen (40%; 24%), RPD (1%; 4%). We created a hierarchical curriculum and developed a DNN (HC-DNN) that learned each task sequentially. We computed its performance for RPD detection in both test sets and compared it to a baseline DNN (B-DNN) that learned to detect RPD from scratch disregarding hierarchical information. We studied their robustness across datasets, while reducing the size of data available for development (same prevalences) - + Results: Area under the receiver operating characteristic curve (AUC) was used to measure RPD detection performance. When large development data were available, there was no significant difference between DNNs (100% data, HC-DNN: 0.96 (95% CI, 0.94-0.97) in AREDS, 0.82 (0.78-0.86) in RS1-6; B-DNN: 0.95 (0.94-0.96) in AREDS, 0.83 (0.79-0.87) in RS1-6). However, HC-DNN achieved better performance and robustness across datasets when development data were highly reduced (<50% data, p-values<0.05) (1% data, HC-DNN: 0.63 (0.60-0.66) in AREDS, 0.76 (0.72-0.80) in RS1-6; B-DNN: 0.53 (0.49-0.56) in AREDS, 0.48 (0.42-0.53) in RS1-6). - + Conclusions: Hierarchical curriculum learning allows for knowledge transfer from general, higher-prevalence features and becomes beneficial for the detection of low-prevalence retinal features, such as RPD, in scarce data settings. 
Moreover, exploiting hierarchical information improves DNN robustness across datasets.}, optnote = {DIAG, RADIOLOGY}, year = {2021}, @@ -9902,13 +9902,13 @@ @conference{Gonz21a title = {Deep learning for automated stratification of ophthalmic images: Application to age-related macular degeneration and color fundus images}, url = {https://euretina.org/resource/abstract_2021_deep-learning-for-automated-stratification-of-ophthalmic-images-application-to-age-related-macular-degeneration-and-color-fundus-images/}, abstract = {Purpose: Deep learning (DL) systems based on convolutional neural networks (CNNs) have achieved expert-level performance in different classification tasks, and have shown the potential to reduce current experts' workload significantly. We explore this potential in the context of automated stratification of ophthalmic images. DL could accelerate the setup of clinical studies by filtering large amounts of images or patients based on specific inclusion criteria, as well as aid in patient selection for clinical trials. DL could also allow for automated categorization of entering images in busy clinical or screening settings, enhancing data triaging, searching, retrieval, and comparison. Automated stratification could also facilitate data collection and application of further DL-based phenotyping analysis, by generating useful sets of images for expert annotation, training, or testing of segmentation algorithms. In our work, we focus on the stratification of color fundus images (CFI) based on multiple features related to age-related macular degeneration (AMD) at different hierarchical levels. We further analyze the robustness of the automated stratification system when the amount of data available for development is limited. We performed our validation on two different population studies. - + Setting/Venue: Deep learning applied to ophthalmic imaging. 
- + Methods: Automated stratification of CFI was performed based on the presence or absence of the following AMD features, following a hierarchical tree with different branches (Bi) and levels (Hi) from generic features (H0) to specific features (H3): AMD findings (H0); B1: drusen (H1), large drusen (H2), reticular pseudodrusen (H3); B2: pigmentary changes (H1), hyperpigmentation (H2), hypopigmentation (H2); B3: late AMD (H1), geographic atrophy (H2), choroidal neovascularization (H2). The automated stratification system consisted of a set of CNNs (based on the Inception-v3 architecture) able to classify the multiple AMD features (presence/absence) at higher and lower levels. This allowed to automatically stratify incoming CFI into the hierarchical tree. CFI from the AREDS dataset were used for development (106,994 CFI) and testing (27,066 CFI) of the CNNs. We validated the robustness of the system to a gradual decrease in the amount of data available for development (100%, 75%, 50%, 25%, 10%, 5%, 2.5%, and 1% of development data). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. This allowed to validate the performance of the automated stratification across studies where different CFI grading protocols were used. - + Results: Area under the receiver operating characteristic curve (AUC) was used to measure the performance of each feature's classification within the automated stratification. The AUC averaged across AMD features when 100% of development data was available was 93.8% (95% CI, 93.4%-94.2%) in AREDS and 84.4% (82.1%-86.5%) in RS1-6. There was an average relative decrease in performance of 10.0+-4.7% between AREDS and the external test set, RS1-6. The performance of the system decreased gradually with each development data reduction. When only 1% of data was available for development, the average AUC was 81.9% (81.0%-82.8%) in AREDS and 74.0% (70.8%-77.0%) in RS1-6. 
This corresponded to an average relative decrease in performance of 12.7+-13.2% in AREDS and 12.6+-7.8% in RS1-6. - + Conclusions: The automated stratification system achieved overall high performance in the classification of different features independently of their hierarchical level. This shows the potential of DL systems to identify diverse phenotypes and to obtain an accurate automated stratification of CFI. The results showed that automated stratification was also robust to a dramatic reduction in the data available for development, maintaining the average AUC above 80%. This is a positive observation, considering that the amount of data available for DL development can be limited in some settings, and the gradings can be costly to obtain. Nevertheless, variability in performance across features could be observed, especially for those with very low prevalence, such as reticular pseudodrusen, where performance became more unstable when few data were available. The external validation showed these observations held when the automated stratification was applied in a different population study, with an expected (but not drastic) drop of performance due to differences between datasets and their grading protocols. In conclusion, our work supports that DL is a powerful tool for the filtering and stratification of ophthalmic images, and has the potential to reduce the workload of experts while supporting them in research and clinical settings.}, optnote = {DIAG, RADIOLOGY}, year = {2021}, @@ -9920,11 +9920,11 @@ @conference{Gonz21b url = {https://journals.sagepub.com/doi/full/10.1177/11206721211047031}, title = {Trustworthy AI: closing the gap between development and integration of AI in Ophthalmology}, abstract = {Design: Descriptive study. - + Purpose: To identify the main aspects that currently complicate the integration of artificial intelligence (AI) in ophthalmic settings. 
- + Methods: Based on an extensive review of state-of-the-art literature of AI applied to Ophthalmology plus interviews with multidisciplinary, international experts, we identified the most relevant aspects to consider during AI design to generate trustworthy (i.e., transparent, robust, and sustainable) AI systems and, consequently, facilitate a subsequent successful integration in real-world ophthalmic settings. - + Results: Several essential aspects to consider were identified: 1) The reliability of the human annotations that are used for establishing the reference standard an AI system learns from, or for setting robust observer studies that allow for fair human-AI performance comparison. 2) The ability of an AI system to generalize across populations, ophthalmic settings, and data acquisition protocols in order to avoid the negative consequences of algorithmic bias and lack of domain adaptation. @@ -9932,7 +9932,7 @@ @conference{Gonz21b 4) The importance of providing interpretable AI-based predictions to open the "black box" and increase trust and clinical usability. 5) A plan to monitor the impact of AI on the clinical workflow, i.e., the adaptation of healthcare providers and patients to the new technology, human-AI interaction, cost-benefit analyses... 6) The necessity to update current regulations to accelerate and control AI integration and all related aspects, such as patient privacy, systems' updates, and liability. - + Conclusions: It is important that healthcare providers in Ophthalmology consider these aspects and their consequences when thinking of AI in practice. It is key that all involved stakeholders collaborate and interact from the beginning of the AI design process to ensure a good alignment with real-world clinical needs and settings. 
This way, it will be possible to generate trustworthy AI solutions and close the gap between development and deployment, so that the AI benefits currently shown on paper reach the final users.}, optnote = {DIAG, RADIOLOGY}, year = {2021}, @@ -10679,12 +10679,12 @@ @conference{Hadd19 year = {2019}, abstract = {Background and Objective: A three-dimensional visualization of a human carcinoma could provide invaluable diagnostic information and redefine how we perceive and analyze cancer invasion. As deep learning begins automating the diagnostic workflow and cutting-edge microscopy provides unprecedented ways of visualizing tissue, combining these methodologies could provide novel insight into malignant tumors and other pathologic entities. By combining Knife-Edge Scanning Microscopy with convolutional neural networks, we set out to visualize an entire three-dimensional colorectal carcinoma segmented into specific tissue classifications. - + Methods: A Knife-Edge Scanning Microscope (KESM), developed by Strateos (San Francisco, CA, USA), was used to digitize a whole-mount, H&E stained, formalin-fixed paraffin-embedded human tissue specimen obtained from the Radboudumc (Nijmegen, Netherlands). Sparse manual annotations of 5 tissue types (tumor, stroma, muscle, healthy glands, background) were provided using KESM data to train a convolutional neural network developed by the Computational Pathology Group (Radboudumc) for semantic segmentation of the colorectal carcinoma tissue. The three-dimensional visualization was generated using 3Scan's proprietary visualization pipeline. - + Results: The convolutional neural network was used to process roughly 1200 slices of KESM data. The stitched and rendered segmentation maps demonstrate the formalin-fixed paraffin-embedded carcinoma of roughly 5 millimeters in depth. As shown in the figure, the tumor invasive margin can be seen advancing into the surrounding tumor stroma. 
- + Conclusion: Based on our findings, we were capable of training a segmentation model on the 3D KESM data to create an accurate representation of a formalin-fixed paraffin-embedded colorectal carcinoma tissue block segmented into five tissue classifications. Going forward, this can have much broader implications on the research and understanding of invasive tumors.}, optnote = {DIAG, RADIOLOGY}, } @@ -10724,9 +10724,9 @@ @article{Hadj22 year = {2022}, doi = {https://doi.org/10.1002/mp.16188}, abstract = {Rapid advances in artificial intelligence (AI) and machine learning, and specifically in deep learning (DL) techniques, have enabled broad application of these methods in health care. The promise of the DL approach has spurred further interest in computer-aided diagnosis (CAD) development and applications using both "traditional" machine learning methods and newer DL-based methods. We use the term CAD-AI to refer to this expanded clinical decision support environment that uses traditional and DL-based AI methods. - + Numerous studies have been published to date on the development of machine learning tools for computer-aided, or AI-assisted, clinical tasks. However, most of these machine learning models are not ready for clinical deployment. It is of paramount importance to ensure that a clinical decision support tool undergoes proper training and rigorous validation of its generalizability and robustness before adoption for patient care in the clinic. - + To address these important issues, the American Association of Physicists in Medicine (AAPM) Computer-Aided Image Analysis Subcommittee (CADSC) is charged, in part, to develop recommendations on practices and standards for the development and performance assessment of computer-aided decision support systems. The committee has previously published two opinion papers on the evaluation of CAD systems and issues associated with user training and quality assurance of these systems in the clinic. 
With machine learning techniques continuing to evolve and CAD applications expanding to new stages of the patient care process, the current task group report considers the broader issues common to the development of most, if not all, CAD-AI applications and their translation from the bench to the clinic. The goal is to bring attention to the proper training and validation of machine learning algorithms that may improve their generalizability and reliability and accelerate the adoption of CAD-AI systems for clinical decision support.}, file = {Hadj22.pdf:pdf\\Hadj22.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -10966,19 +10966,19 @@ @article{Harl21 doi = {10.1093/rheumatology/keab835}, year = {2021}, abstract = {Abstract - + Objectives Earlier retrospective studies have suggested a relation between DISH and cardiovascular disease, including myocardial infarction. The present study assessed the association between DISH and incidence of cardiovascular events and mortality in patients with high cardiovascular risk. - - + + Methods In this prospective cohort study, we included 4624 patients (mean age 58.4 years, 69.6% male) from the Second Manifestations of ARTerial disease cohort. The main end point was major cardiovascular events (MACE: stroke, myocardial infarction and vascular death). Secondary endpoints included all-cause mortality and separate vascular events. Cause-specific proportional hazard models were used to evaluate the risk of DISH on all outcomes, and subdistribution hazard models were used to evaluate the effect of DISH on the cumulative incidence. All models were adjusted for age, sex, body mass index, blood pressure, diabetes, non-HDL cholesterol, packyears, renal function and C-reactive protein. - - + + Results DISH was present in 435 (9.4%) patients. After a median follow-up of 8.7 (IQR 5.0-12.0) years, 864 patients had died and 728 patients developed a MACE event. DISH was associated with an increased cumulative incidence of ischaemic stroke. 
After adjustment in cause-specific modelling, DISH remained significantly associated with ischaemic stroke (HR 1.55; 95% CI: 1.01, 2.38), but not with MACE (HR 0.99; 95% CI: 0.79, 1.24), myocardial infarction (HR 0.88; 95% CI: 0.59, 1.31), vascular death (HR 0.94; 95% CI: 0.68, 1.27) or all-cause mortality (HR 0.94; 95% CI: 0.77, 1.16). - - + + Conclusion The presence of DISH is independently associated with an increased incidence and risk for ischaemic stroke, but not with MACE, myocardial infarction, vascular death or all-cause mortality. }, @@ -11236,11 +11236,11 @@ @article{Hend23 doi = {https://doi.org/10.1007/s00330-022-09205-4}, url = {https://link.springer.com/article/10.1007/s00330-022-09205-4}, abstract = {Objectives: To assess how an artificial intelligence (AI) algorithm performs against five experienced musculoskeletal radiologists in diagnosing scaphoid fractures and whether it aids their diagnosis on conventional multi-view radiographs. - + Methods: Four datasets of conventional hand, wrist, and scaphoid radiographs were retrospectively acquired at two hospitals (hospitals A and B). Dataset 1 (12,990 radiographs from 3353 patients, hospital A) and dataset 2 (1117 radiographs from 394 patients, hospital B) were used for training and testing a scaphoid localization and laterality classification component. Dataset 3 (4316 radiographs from 840 patients, hospital A) and dataset 4 (688 radiographs from 209 patients, hospital B) were used for training and testing the fracture detector. The algorithm was compared with the radiologists in an observer study. Evaluation metrics included sensitivity, specificity, positive predictive value (PPV), area under the characteristic operating curve (AUC), Cohen's kappa coefficient (k), fracture localization precision, and reading time. - + Results: The algorithm detected scaphoid fractures with a sensitivity of 72%, specificity of 93%, PPV of 81%, and AUC of 0.88. 
The AUC of the algorithm did not differ from each radiologist (0.87 [radiologists' mean], p >=.05). AI assistance improved five out of ten pairs of inter-observer Cohen's k agreements (p <.05) and reduced reading time in four radiologists (p <.001), but did not improve other metrics in the majority of radiologists (p >=.05). - + Conclusions: The AI algorithm detects scaphoid fractures on conventional multi-view radiographs at the level of five experienced musculoskeletal radiologists and could significantly shorten their reading time.}, file = {Hend23.pdf:pdf\\Hend23.pdf:PDF}, journal = ER, @@ -11261,16 +11261,16 @@ @article{Hend23a url = {https://doi.org/10.1007/s00330-023-09826-3}, abstract = {Objective To study trends in the incidence of reported pulmonary nodules and stage I lung cancer in chest CT. - + Methods We analyzed the trends in the incidence of detected pulmonary nodules and stage I lung cancer in chest CT scans in the period between 2008 and 2019. Imaging metadata and radiology reports from all chest CT studies were collected from two large Dutch hospitals. A natural language processing algorithm was developed to identify studies with any reported pulmonary nodule. - + Results Between 2008 and 2019, a total of 74,803 patients underwent 166,688 chest CT examinations at both hospitals combined. During this period, the annual number of chest CT scans increased from 9955 scans in 6845 patients in 2008 to 20,476 scans in 13,286 patients in 2019. The proportion of patients in whom nodules (old or new) were reported increased from 38% (2595/6845) in 2008 to 50% (6654/13,286) in 2019. The proportion of patients in whom significant new nodules (>= 5 mm) were reported increased from 9% (608/6954) in 2010 to 17% (1660/9883) in 2017. The number of patients with new nodules and corresponding stage I lung cancer diagnosis tripled and their proportion doubled, from 0.4% (26/6954) in 2010 to 0.8% (78/9883) in 2017. 
- + Conclusion The identification of incidental pulmonary nodules in chest CT has steadily increased over the past decade and has been accompanied by more stage I lung cancer diagnoses. - + Clinical relevance statement These findings stress the importance of identifying and efficiently managing incidental pulmonary nodules in routine clinical practice.}, file = {Hend23a.pdf:pdf\\Hend23a.pdf:PDF}, @@ -11293,13 +11293,13 @@ @article{Hend23b abstract = {Abstract Background Outside a screening program, early-stage lung cancer is generally diagnosed after the detection of incidental nodules in clinically ordered chest CT scans. Despite the advances in artificial intelligence (AI) systems for lung cancer detection, clinical validation of these systems is lacking in a non-screening setting. - + Method We developed a deep learning-based AI system and assessed its performance for the detection of actionable benign nodules (requiring follow-up), small lung cancers, and pulmonary metastases in CT scans acquired in two Dutch hospitals (internal and external validation). A panel of five thoracic radiologists labeled all nodules, and two additional radiologists verified the nodule malignancy status and searched for any missed cancers using data from the national Netherlands Cancer Registry. The detection performance was evaluated by measuring the sensitivity at predefined false positive rates on a free receiver operating characteristic curve and was compared with the panel of radiologists. - + Results On the external test set (100 scans from 100 patients), the sensitivity of the AI system for detecting benign nodules, primary lung cancers, and metastases is respectively 94.3% (82/87, 95% CI: 88.1-98.8%), 96.9% (31/32, 95% CI: 91.7-100%), and 92.0% (104/113, 95% CI: 88.5-95.5%) at a clinically acceptable operating point of 1 false positive per scan (FP/s). 
These sensitivities are comparable to or higher than the radiologists, albeit with a slightly higher FP/s (average difference of 0.6). - + Conclusions The AI system reliably detects benign and malignant pulmonary nodules in clinically indicated CT scans and can potentially assist radiologists in this setting.}, citation-count = {0}, @@ -11620,9 +11620,9 @@ @article{Heuv16 pages = {241 - 251}, doi = {10.1016/j.nicl.2016.07.002}, abstract = {In this paper a Computer Aided Detection (CAD) system is presented to automatically detect Cerebral Microbleeds (CMBs) in patients with Traumatic Brain Injury (TBI). It is believed that the presence of CMBs has clinical prognostic value in TBI patients. To study the contribution of CMBs in patient outcome, accurate detection of CMBs is required. Manual detection of CMBs in TBI patients is a time consuming task that is prone to errors, because CMBs are easily overlooked and are difficult to distinguish from blood vessels. - + This study included 33 TBI patients. Because of the laborious nature of manually annotating CMBs, only one trained expert manually annotated the CMBs in all 33 patients. A subset of ten TBI patients was annotated by six experts. Our CAD system makes use of both Susceptibility Weighted Imaging (SWI) and T1 weighted magnetic resonance images to detect CMBs. After pre-processing these images, a two-step approach was used for automated detection of CMBs. In the first step, each voxel was characterized by twelve features based on the dark and spherical nature of CMBs and a random forest classifier was used to identify CMB candidate locations. In the second step, segmentations were made from each identified candidate location. Subsequently an object-based classifier was used to remove false positive detections of the voxel classifier, by considering seven object-based features that discriminate between spherical objects (CMBs) and elongated objects (blood vessels). 
A guided user interface was designed for fast evaluation of the CAD system result. During this process, an expert checked each CMB detected by the CAD system. - + A Fleiss' kappa value of only 0.24 showed that the inter-observer variability for the TBI patients in this study was very large. An expert using the guided user interface reached an average sensitivity of 93%, which was significantly higher (p = 0.03) than the average sensitivity of 77% (sd 12.4%) that the six experts manually detected. Furthermore, with the use of this CAD system the reading time was substantially reduced from one hour to 13 minutes per patient, because the CAD system only detects on average 25.9 false positives per TBI patient, resulting in 0.29 false positives per definite CMB finding.}, file = {Heuv16.pdf:pdf\\Heuv16.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -12638,13 +12638,13 @@ @article{Hoss21 url = {https://doi.org/10.1007/s00330-021-08320-y}, abstract = {Objectives To assess Prostate Imaging Reporting and Data System (PI-RADS)-trained deep learning (DL) algorithm performance and to investigate the effect of data size and prior knowledge on the detection of clinically significant prostate cancer (csPCa) in biopsy-naive men with a suspicion of PCa. - + Methods Multi-institution data included 2734 consecutive biopsy-naive men with elevated PSA levels (>= 3 ng/mL) that underwent multi-parametric MRI (mpMRI). mpMRI exams were prospectively reported using PI-RADS v2 by expert radiologists. A DL framework was designed and trained on center 1 data (n = 1952) to predict PI-RADS >= 4 (n = 1092) lesions from bi-parametric MRI (bpMRI). Experiments included varying the number of cases and the use of automatic zonal segmentation as a DL prior. Independent center 2 cases (n = 296) that included pathology outcome (systematic and MRI targeted biopsy) were used to compute performance for radiologists and DL. 
The performance of detecting PI-RADS 4-5 and Gleason > 6 lesions was assessed on 782 unseen cases (486 center 1, 296 center 2) using free-response ROC (FROC) and ROC analysis. - + Results The DL sensitivity for detecting PI-RADS >= 4 lesions was 87% (193/223, 95% CI: 82-91) at an average of 1 false positive (FP) per patient, and an AUC of 0.88 (95% CI: 0.84-0.91). The DL sensitivity for the detection of Gleason > 6 lesions was 85% (79/93, 95% CI: 77-83) @ 1 FP compared to 91% (85/93, 95% CI: 84-96) @ 0.3 FP for a consensus panel of expert radiologists. Data size and prior zonal knowledge significantly affected performance (4%, p<0.05). - + Conclusion PI-RADS-trained DL can accurately detect and localize Gleason > 6 lesions. DL could reach expert performance using substantially more than 2000 training cases, and DL zonal segmentation.}, taverne_url = {https://repository.ubn.ru.nl/handle/2066/249485}, @@ -15025,17 +15025,17 @@ @article{Kauc20 pages = {3277-3294}, doi = {10.1007/s00330-020-06727-7}, abstract = {In Europe, lung cancer ranks third among the most common cancers, remaining the biggest killer. Since the publication of the first European Society of Radiology and European Respiratory Society joint white paper on lung cancer screening (LCS) in 2015, many new findings have been published and discussions have increased considerably. Thus, this updated expert opinion represents a narrative, non-systematic review of the evidence from LCS trials and description of the current practice of LCS as well as aspects that have not received adequate attention until now. Reaching out to the potential participants (persons at high risk), optimal communication and shared decision-making will be key starting points. 
Furthermore, standards for infrastructure, pathways and quality assurance are pivotal, including promoting tobacco cessation, benefits and harms, overdiagnosis, quality, minimum radiation exposure, definition of management of positive screen results and incidental findings linked to respective actions as well as cost-effectiveness. This requires a multidisciplinary team with experts from pulmonology and radiology as well as thoracic oncologists, thoracic surgeons, pathologists, family doctors, patient representatives and others. The ESR and ERS agree that Europe's health systems need to adapt to allow citizens to benefit from organised pathways, rather than unsupervised initiatives, to allow early diagnosis of lung cancer and reduce the mortality rate. Now is the time to set up and conduct demonstration programmes focusing, among other points, on methodology, standardisation, tobacco cessation, education on healthy lifestyle, cost-effectiveness and a central registry. - + Key Points - + * Pulmonologists and radiologists both have key roles in the set up of multidisciplinary LCS teams with experts from many other fields. - + * Pulmonologists identify people eligible for LCS, reach out to family doctors, share the decision-making process and promote tobacco cessation. - + * Radiologists ensure appropriate image quality, minimum dose and a standardised reading/reporting algorithm, together with a clear definition of a "positive screen". - + * Strict algorithms define the exact management of screen-detected nodules and incidental findings. - + * For LCS to be (cost-)effective, it has to target a population defined by risk prediction models.}, file = {Kauc20.pdf:pdf\\Kauc20.pdf:PDF}, optnote = {DIAG, INPRESS, RADIOLOGY}, @@ -15104,16 +15104,16 @@ @article{Kemp21 abstract = {Abstract Objectives Different machine learning algorithms (MLAs) for automated segmentation of gliomas have been reported in the literature. 
Automated segmentation of different tumor characteristics can be of added value for the diagnostic work-up and treatment planning. The purpose of this study was to provide an overview and meta-analysis of different MLA methods. - + Methods A systematic literature review and meta-analysis was performed on the eligible studies describing the segmentation of gliomas. Meta-analysis of the performance was conducted on the reported dice similarity coefficient (DSC) score of both the aggregated results as two subgroups (i.e., high-grade and low-grade gliomas). This study was registered in PROSPERO prior to initiation (CRD42020191033). - + Results After the literature search (n = 734), 42 studies were included in the systematic literature review. Ten studies were eligible for inclusion in the meta-analysis. Overall, the MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86). In addition, a DSC score of 0.83 (95% CI: 0.80-0.87) and 0.82 (95% CI: 0.78-0.87) was observed for the automated glioma segmentation of the high-grade and low-grade gliomas, respectively. However, heterogeneity was considerably high between included studies, and publication bias was observed. - + Conclusion MLAs facilitating automated segmentation of gliomas show good accuracy, which is promising for future implementation in neuroradiology. However, before actual implementation, a few hurdles are yet to be overcome. It is crucial that quality guidelines are followed when reporting on MLAs, which includes validation on an external test set. - + Key Points * MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86), indicating a good performance. * MLA performance was comparable when comparing the segmentation results of the high-grade gliomas and the low-grade gliomas. 
@@ -15140,16 +15140,16 @@ @article{Kers21 abstract = {Abstract Objectives To evaluate if artificial intelligence (AI) can discriminate recalled benign from recalled malignant mammographic screening abnormalities to improve screening performance. - + Methods A total of 2257 full-field digital mammography screening examinations, obtained 2011-2013, of women aged 50-69 years which were recalled for further assessment of 295 malignant out of 305 truly malignant lesions and 2289 benign lesions after independent double-reading with arbitration, were included in this retrospective study. A deep learning AI system was used to obtain a score (0-95) for each recalled lesion, representing the likelihood of breast cancer. The sensitivity on the lesion level and the proportion of women without false-positive ratings (non-FPR) resulting under AI were estimated as a function of the classification cutoff and compared to that of human readers. - + Results Using a cutoff of 1, AI decreased the proportion of women with false-positives from 89.9 to 62.0%, non-FPR 11.1% vs. 38.0% (difference 26.9%, 95% confidence interval 25.1-28.8%; p < .001), preventing 30.1% of reader-induced false-positive recalls, while reducing sensitivity from 96.7 to 91.1% (5.6%, 3.1-8.0%) as compared to human reading. The positive predictive value of recall (PPV-1) increased from 12.8 to 16.5% (3.7%, 3.5-4.0%). In women with mass-related lesions (n = 900), the non-FPR was 14.2% for humans vs. 36.7% for AI (22.4%, 19.8-25.3%) at a sensitivity of 98.5% vs. 97.1% (1.5%, 0-3.5%). - + Conclusion The application of AI during consensus conference might especially help readers to reduce false-positive recalls of masses at the expense of a small sensitivity reduction. Prospective studies are needed to further evaluate the screening benefit of AI in practice. 
- + Key Points * Integrating the use of artificial intelligence in the arbitration process reduces benign recalls and increases the positive predictive value of recall at the expense of some sensitivity loss. * Application of the artificial intelligence system to aid the decision to recall a woman seems particularly beneficial for masses, where the system reaches comparable sensitivity to that of the readers, but with considerably reduced false-positives. @@ -15649,7 +15649,7 @@ @article{Kooi17d pages = {International Society for Optics and Photonics}, doi = {10.1117/1.JMI.4.4.044501}, abstract = {Neural networks, in particular deep Convolutional Neural Networks (CNN), have recently gone through a renaissance sparked by the introduction of more efficient training procedures and massive amounts of raw annotated data. Barring a handful of modalities, medical images are typically too large to present as input as a whole and models are consequently trained with subsets of images or cases, representing the most crucial bits of information. When inspecting a scene to identify objects, humans take cues from not just the article in question but also the elements in its vicinity: a frisbee is more likely to be a plate in the presence of a fork and knife. Similar principles apply to the analysis of medical images: specialists base their judgment of an abnormality on all available data, harnessing information such as symmetrical differences in or between organs in question and temporal change, if multiple recordings are available. \\ - + In this paper we investigate the addition of symmetry and temporal context information to a deep CNN with the purpose of detecting malignant soft tissue lesions in mammography. We employ a simple linear mapping that takes the location of a mass candidate and maps it to either the contra-lateral or prior mammogram and Regions Of Interest (ROI) are extracted around each location. 
We subsequently explore two different architectures (1) a fusion model employing two datastreams where both ROIs are fed to the network during training and testing and (2) a stage-wise approach where a single ROI CNN is trained on the primary image and subsequently used as feature extractor for both primary and symmetrical or prior ROIs. A 'shallow' Gradient Boosted Tree (GBT) classifier is then trained on the concatenation of these features and used to classify the joint representation. Results showed a significant increase in performance using the first architecture and symmetry information, but only marginal gains in performance using temporal data and the other setting. We feel results are promising and can greatly be improved when more temporal data becomes available.}, file = {Kooi17d.pdf:pdf\\Kooi17d.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -16404,7 +16404,7 @@ @article{Leem19b doi = {10.21105/joss.01576}, code = {https://github.com/silvandeleemput/memcnn}, abstract = {Neural networks are computational models that were originally inspired by biological neural networks like animal brains. These networks are composed of many small computational units called neurons that perform elementary calculations. Instead of explicitly programming the behavior of neural networks, these models can be trained to perform tasks, like classifying images, by presenting them examples. Sufficiently complex neural networks can automatically extract task-relevant characteristics from the presented examples without having prior knowledge about the task domain, which makes them attractive for many complicated real-world applications. - + Reversible operations have recently been successfully applied to classification problems to reduce memory requirements during neural network training. This feature is accomplished by removing the need to store the input activation for computing the gradients at the backward pass and instead reconstruct them on demand. 
However, current approaches rely on custom implementations of backpropagation, which limits applicability and extendibility. We present MemCNN, a novel PyTorch framework that simplifies the application of reversible functions by removing the need for a customized backpropagation. The framework contains a set of practical generalized tools, which can wrap common operations like convolutions and batch normalization and which take care of memory management. We validate the presented framework by reproducing state-of-the-art experiments using MemCNN and by comparing classification accuracy and training time on Cifar-10 and Cifar-100. Our MemCNN implementations achieved similar classification accuracy and faster training times while retaining compatibility with the default backpropagation facilities of PyTorch.}, file = {:pdf/Leem19b.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -16641,19 +16641,19 @@ @article{Leeu23a abstract = {Abstract Objectives To map the clinical use of CE-marked artificial intelligence (AI)-based software in radiology departments in the Netherlands (n = 69) between 2020 and 2022. - + Materials and methods Our AI network (one radiologist or AI representative per Dutch hospital organization) received a questionnaire each spring from 2020 to 2022 about AI product usage, financing, and obstacles to adoption. Products that were not listed on www.AIforRadiology.com by July 2022 were excluded from the analysis. - + Results The number of respondents was 43 in 2020, 36 in 2021, and 33 in 2022. The number of departments using AI has been growing steadily (2020: 14, 2021: 19, 2022: 23). The diversity (2020: 7, 2021: 18, 2022: 34) and the number of total implementations (2020: 19, 2021: 38, 2022: 68) has rapidly increased. Seven implementations were discontinued in 2022. Four hospital organizations said to use an AI platform or marketplace for the deployment of AI solutions. 
AI is mostly used to support chest CT (17), neuro CT (17), and musculoskeletal radiograph (12) analysis. The budget for AI was reserved in 13 of the responding centers in both 2021 and 2022. The most important obstacles to the adoption of AI remained costs and IT integration. Of the respondents, 28% stated that the implemented AI products realized health improvement and 32% assumed both health improvement and cost savings. - + Conclusion The adoption of AI products in radiology departments in the Netherlands is showing common signs of a developing market. The major obstacles to reaching widespread adoption are a lack of financial resources and IT integration difficulties. - + Clinical relevance statement The clinical impact of AI starts with its adoption in daily clinical practice. Increased transparency around AI products being adopted, implementation obstacles, and impact may inspire increased collaboration and improved decision-making around the implementation and financing of AI products. - + Key Points The adoption of artificial intelligence products for radiology has steadily increased since 2020 to at least a third of the centers using AI in clinical practice in the Netherlands in 2022. The main areas in which artificial intelligence products are used are lung nodule detection on CT, aided stroke diagnosis, and bone age prediction. @@ -16721,11 +16721,11 @@ @article{Leij17 pages = {1569-1577}, doi = {10.1212/WNL.0000000000004490}, abstract = {Objective: To investigate the temporal dynamics of cerebral small vessel disease (SVD) by 3 consecutive assessments over a period of 9 years, distinguishing progression from regression. - + Methods: Changes in SVD markers of 276 participants of the Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort (RUN DMC) cohort were assessed at 3 time points over 9 years. We assessed white matter hyperintensities (WMH) volume by semiautomatic segmentation and rated lacunes and microbleeds manually. 
We categorized baseline WMH severity as mild, moderate, or severe according to the modified Fazekas scale. We performed mixed-effects regression analysis including a quadratic term for increasing age. - + Results: Mean WMH progression over 9 years was 4.7 mL (0.54 mL/y; interquartile range 0.95-5.5 mL), 20.3% of patients had incident lacunes (2.3%/y), and 18.9% had incident microbleeds (2.2%/y). WMH volume declined in 9.4% of the participants during the first follow-up interval, but only for 1 participant (0.4%) throughout the whole follow-up. Lacunes disappeared in 3.6% and microbleeds in 5.7% of the participants. WMH progression accelerated over time: including a quadratic term for increasing age during follow-up significantly improved the model (p < 0.001). SVD progression was predominantly seen in participants with moderate to severe WMH at baseline compared to those with mild WMH (odds ratio [OR] 35.5, 95% confidence interval [CI] 15.8-80.0, p < 0.001 for WMH progression; OR 5.7, 95% CI 2.8-11.2, p < 0.001 for incident lacunes; and OR 2.9, 95% CI 1.4-5.9, p = 0.003 for incident microbleeds). - + Conclusions: SVD progression is nonlinear, accelerating over time, and a highly dynamic process, with progression interrupted by reduction in some, in a population that on average shows progression.}, file = {Leij17.pdf:pdf\\Leij17.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -16764,16 +16764,16 @@ @article{Leij18a abstract = { Background and Purpose-- White matter hyperintensities (WMH) are frequently seen on neuroimaging of elderly and are associated with cognitive decline and the development of dementia. Yet, the temporal dynamics of conversion of normal-appearing white matter (NAWM) into WMH remains unknown. We examined whether and when progression of WMH was preceded by changes in fluid-attenuated inversion recovery and diffusion tensor imaging values, thereby taking into account differences between participants with mild versus severe baseline WMH. 
- - + + Methods-- From 266 participants of the RUN DMC study (Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort), we semiautomatically segmented WMH at 3 time points for 9 years. Images were registered to standard space through a subject template. We analyzed differences in baseline fluid-attenuated inversion recovery, fractional anisotropy, and mean diffusivity (MD) values and changes in MD values over time between 4 regions: (1) remaining NAWM, (2) NAWM converting into WMH in the second follow-up period, (3) NAWM converting into WMH in the first follow-up period, and (4) WMH. - - + + Results-- NAWM converting into WMH in the first or second time interval showed higher fluid-attenuated inversion recovery and MD values than remaining NAWM. MD values in NAWM converting into WMH in the first time interval were similar to MD values in WMH. When stratified by baseline WMH severity, participants with severe WMH had higher fluid-attenuated inversion recovery and MD and lower fractional anisotropy values than participants with mild WMH, in all areas including the NAWM. MD values in WMH and in NAWM that converted into WMH continuously increased over time. - - + + Conclusions-- Impaired microstructural integrity preceded conversion into WMH and continuously declined over time, suggesting a continuous disease process of white matter integrity loss that can be detected using diffusion tensor imaging even years before WMH become visible on conventional neuroimaging. Differences in microstructural integrity between participants with mild versus severe WMH suggest heterogeneity of both NAWM and WMH, which might explain the clinical variability observed in patients with similar small vessel disease severity. 
}, @@ -16976,11 +16976,11 @@ @inproceedings{Less16 pages = {978511-1 -- 978511-6}, doi = {10.1117/12.2216978}, abstract = {The amount of calcifications in the coronary arteries is a powerful and independent predictor of cardiovascular events and is used to identify subjects at high risk who might benefit from preventive treatment. Routine quantification of coronary calcium scores can complement screening programs using low-dose chest CT, such as lung cancer screening. We present a system for automatic coronary calcium scoring based on deep convolutional neural networks (CNNs). - + The system uses three independently trained CNNs to estimate a bounding box around the heart. In this region of interest, connected components above 130 HU are considered candidates for coronary artery calcifications. To separate them from other high intensity lesions, classification of all extracted voxels is performed by feeding two-dimensional 50 mm x 50 mm patches from three orthogonal planes into three concurrent CNNs. The networks consist of three convolutional layers and one fully-connected layer with 256 neurons. - + In the experiments, 1028 non-contrast-enhanced and non-ECG-triggered low-dose chest CT scans were used. The network was trained on 797 scans. In the remaining 231 test scans, the method detected on average 194.3 mm3 of 199.8 mm3 coronary calcifications per scan (sensitivity 97.2%) with an average false-positive volume of 10.3 mm3. Subjects were assigned to one of five standard cardiovascular risk categories based on the Agatston score. Accuracy of risk category assignment was 84.4% with a linearly weighted kappa of 0.89. 
- + The proposed system can perform automatic coronary artery calcium scoring to identify subjects undergoing low-dose chest CT screening who are at risk of cardiovascular events with high accuracy.}, file = {Less16.pdf:pdf\\Less16.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -17263,17 +17263,17 @@ @conference{Lief16 abstract = {Purpose : A tool for grading micro-aneurysms and the Foveal Avascular Zone (FAZ) in Fluorescein Angiography (FA) and OCT Angiography (OCTA) has been developed. With this tool the user can compare visibility and grade micro-aneurysms by displaying early FA, late FA and inner, intermediate and outer OCTA images in a synchronized view. - + Methods : The user can register the images in two steps by clicking on corresponding landmarks: early and late FA should be registered, as well as early FA to OCTA. A least-squares approximation to the affine transform that best matches the annotated point sets is calculated. Visual feedback is available during this stage by blending the images that need to be registered. Once the images are registered, a synchronized cursor helps the user in finding and comparing micro-aneurysms in all five images. The FAZ, for which the area is automatically calculated, can be drawn onto each image as well. - + Results : - + Early and late FA and OCTA images, segmented into an inner, intermediate and outer layer, have been acquired for 31 eyes of 24 patients with Diabetic Macular Edema (DME). In every set of images, enough landmarks could be found for successful registration. The affine transform was sufficiently accurate to compare micro-aneurysms in the different images. The tool has been used for grading visibility and leakage of 567 micro-aneurysms. The FAZ could be delineated accurately in each image except the late FA where it was not visible. 
- + Conclusion : We developed a tool that can help researchers in comparing properties of FA and OCTA images, by registration of 5 different images (early and late FA, inner, intermediate and outer OCTA). The tool has been used for grading micro-aneurysms and delineating the FAZ for patients with DME.}, @@ -17304,14 +17304,14 @@ @conference{Lief17a booktitle = ARVO, title = {Automatic detection of the foveal center in optical coherence tomography}, abstract = {Purpose : To automatically detect the foveal center in optical coherence tomography (OCT) scans in order to obtain an accurate and reliable reference for the assessment of various structural biomarkers, even in the presence of large abnormalities and across different scanning protocols. - + Methods : 1784 OCT scans were used for the development of the proposed automatic method: 1744 scans from the European Genetic Database (EUGENDA) acquired with a Heidelberg Spectralis HRA+OCT 1 scanner and 40 scans from a publicly available dataset [1] acquired with a Bioptigen scanner. Two independent sets, with different levels of age-related macular degeneration (AMD) were drawn from the same databases for evaluation: 100 scans from EUGENDA (Set A, 25 control patients and 25 for each of the AMD severity levels early, intermediate and advanced) and 100 scans from [1] (Set B, 50 control, 50 AMD). A fully convolutional neural network based on stacked layers of dilated convolutions was trained to classify each pixel in a B-scan by assigning a probability of belonging to the fovea. The network was applied to every B-scan in the OCT volume, and the final foveal center was defined as the pixel with maximum assigned probability. An initial network was trained on the 1744 training scans from EUGENDA and optimized with the 40 training scans acquired with the Bioptigen scanner, to specialize for different levels of noise and contrast. - + For all scans manual annotations were available as reference for evaluation. 
The foveal center was considered correctly identified if the distance between the prediction and the reference was less than the foveal radius, i.e. 750 $\mu$m. - + Results : The foveal center was correctly detected in 95 OCT scans in Set A (24 control, 24 early, 25 intermediate, 22 advanced). The mean distance error was 63.7 $\mu$m with 81 detections inside a radius of 175 $\mu$m (the foveola) and 70 inside a radius of 75 $\mu$m (the umbo). In Set B, the foveal center was correctly identified in 96 OCT scans (49 control, 47 AMD). The mean distance error was 88.6 $\mu$m with 82 detections inside the foveola and 61 inside the umbo. - + Conclusions : The proposed automatic method performed accurately for both healthy retinas and retinas affected by AMD. The method can be applied successfully to scans from different vendors, thus providing a reliable reference location for the assessment of structural biomarkers in OCT.}, optnote = {DIAG, RADIOLOGY}, year = {2017}, @@ -17365,14 +17365,14 @@ @conference{Lief19a title = {Prediction of areas at risk of developing geographic atrophy in color fundus images using deep learning}, abstract = {Purpose: Exact quantification of areas of geographic atrophy (GA) can provide an important anatomical endpoint for treatment trials. The prediction of areas where GA may develop can provide valuable personalized prognosis and help in the development of targeted treatments to prevent progression and further vision loss. In this work, we present a model based on a deep convolutional neural network (CNN) that predicts the areas of GA within 5 years from baseline using color fundus (CF) images. - + Methods: Areas of GA were delineated by 4 to 5 experienced graders in consensus in 377 CF images (252 eyes) collected from the Rotterdam Study and the Blue Mountains Eye Study. Graders made use of multimodal and follow up images when available, using our EyeNED annotation workstation. 
We identified 84 pairs of images (baseline and follow-up) of the same eye that were acquired with an interval of approximately 5 years. Image registration was performed by identifying corresponding landmarks between the images, allowing to project the delineated GA of the follow-up image onto the baseline image. Next, a fully automatic segmentation model, based on a deep CNN, was developed. The CNN was trained to simultaneously segment the current GA area and the area at risk of developing GA, using only the baseline image as input. A five-fold cross-validation was performed to validate the prediction performance. - + Results: The model achieved an average dice coefficient of 0.63 for segmentation of areas at risk of developing GA in the 84 images. The intraclass correlation coefficient between the GA area defined by the consensus grading of the follow-up image and the automatically predicted area based on the baseline image was 0.54. - + Conclusions: We present a model based on a deep CNN that is capable of identifying areas where GA may develop from CF images. The proposed approach constitutes a step towards personalized prognosis and possible treatment decisions. Furthermore, the model may be used for automatic discovery of new predictive biomarkers for development and growth rate of GA, and may help to automatically identify individuals at risk of developing GA.}, optnote = {DIAG, RADIOLOGY}, @@ -17401,22 +17401,22 @@ @article{Lief20 url = {https://arxiv.org/abs/1908.05621}, abstract = {PURPOSE: To develop and validate a deep learning model for the automatic segmentation of geographic atrophy (GA) using color fundus images (CFIs) and its application to study the growth rate of GA. - + DESIGN: Prospective, multicenter, natural history study with up to 15 years of follow-up. 
- + PARTICIPANTS: Four hundred nine CFIs of 238 eyes with GA from the Rotterdam Study (RS) and Blue Mountain Eye Study (BMES) for model development, and 3589 CFIs of 376 eyes from the Age-Related Eye Disease Study (AREDS) for analysis of GA growth rate. - + METHODS: Deep learning model based on an ensemble of encoder-decoder architectures was implemented and optimized for the segmentation of GA in CFIs. Four experienced graders delineated, in consensus, GA in CFIs from the RS and BMES. These manual delineations were used to evaluate the segmentation model using 5-fold cross-validation. The model was applied further to CFIs from the AREDS to study the growth rate of GA. Linear regression analysis was used to study associations between structural biomarkers at baseline and the GA growth rate. A general estimate of the progression of GA area over time was made by combining growth rates of all eyes with GA from the AREDS set. - + MAIN OUTCOME MEASURES: Automatically segmented GA and GA growth rate. - + RESULTS: The model obtained an average Dice coefficient of 0.72+-0.26 on the BMES and RS set while comparing the automatically segmented GA area with the graders' manual delineations. An intraclass correlation coefficient of 0.83 was reached between the automatically estimated GA area and the graders' consensus measures. Nine automatically calculated structural biomarkers (area, filled area, convex area, convex solidity, eccentricity, roundness, foveal involvement, perimeter, and circularity) were significantly associated with growth rate. Combining all growth rates indicated that GA area grows quadratically up to an area of approximately 12 mm2, after which growth rate stabilizes or decreases. - + CONCLUSIONS: The deep learning model allowed for fully automatic and robust segmentation of GA on CFIs. 
These segmentations can be used to extract structural characteristics of GA that predict its growth rate.}, file = {Lief20.pdf:pdf\\Lief20.pdf:PDF}, @@ -17459,16 +17459,16 @@ @article{Lief21 author = {Liefers, Bart and Taylor, Paul and Alsaedi, Abdulrahman and Bailey, Clare and Balaskas, Konstantinos and Dhingra, Narendra and Egan, Catherine A and Rodrigues, Filipa Gomes and Gonz\'{a}lez-Gonzalo, Cristina and Heeren, Tjebo F.C. and Lotery, Andrew and Muller, Philipp L. and Olvera-Barrios, Abraham and Paul, Bobby and Schwartz, Roy and Thomas, Darren S. and Warwick, Alasdair N. and Tufail, Adnan and S\'{a}nchez, Clara I.}, abstract = {Purpose: To develop and validate a deep learning model for segmentation of 13 features associated with neovascular and atrophic age-related macular degeneration (AMD). - + Design: Development and validation of a deep-learning model for feature segmentation. - + Methods: Data for model development were obtained from 307 optical coherence tomography volumes. Eight experienced graders manually delineated all abnormalities in 2,712 B-scans. A deep neural network was trained with this data to perform voxel-level segmentation of the 13 most common abnormalities (features). For evaluation, 112 B-scans from 112 patients with a diagnosis of neovascular AMD were annotated by four independent observers. Main outcome measures were Dice score, intra-class correlation coefficient (ICC), and free-response receiver operating characteristic (FROC) curve. - + Results: On 11 of the 13 features, the model obtained a mean Dice score of 0.63 +- 0.15, compared to 0.61 +- 0.17 for the observers. The mean ICC for the model was 0.66 +- 0.22, compared to 0.62 +- 0.21 for the observers. Two features were not evaluated quantitatively due to lack of data. FROC analysis demonstrated that the model scored similar or higher sensitivity per false positives compared to the observers. 
- + Conclusions: The quality of the automatic segmentation matches that of experienced graders for most features, exceeding human performance for some features. The quantified parameters provided by the model can be used in the current clinical routine and open possibilities for further research into treatment response outside clinical trials.}, journal = AJO, @@ -17491,7 +17491,7 @@ @phdthesis{Lief22 url = {https://repository.ubn.ru.nl/handle/2066/252875}, abstract = {This thesis is devoted to the applications of deep learning algorithms for automated analysis of retinal images. In contains chapters on: - + 1. Automatic detection of the foveal center in OCT scans (Chapter 2); 2. Segmentation of retinal layers and geographic atrophy (Chapter 3); 3. Segmentation of geographic atrophy on color fundus (Chapter 4); @@ -18235,13 +18235,13 @@ @conference{Loma23a title = {Deep learning for multi-class cell detection in H&E-stained slides of diffuse gastric cancer}, abstract = {Background & objective Diffuse gastric cancer (DGC) is characterized by poorly cohesive cells which are difficult to detect. We propose the first deep learning model to detect classical signet ring cells (SRCs), atypical SRCs, and poorly differentiated cells in H&E-stained slides of DGC. - + Methods We collected slides from 9 patients with hereditary DGC, resulting in 105 and 3 whole-slide images (WSIs) of gastric resections and biopsies, respectively. The three target cell types were annotated, resulting in 24,695 cell-level annotations. We trained a deep learning model with the Faster-RCNN architecture using 99 WSIs in the development set. - + Results The algorithm was tested on 9 WSIs in the independent validation set. Model predictions were counted as correct if they were within a 15-micron radius from the expert reference annotations. 
For evaluation, we split the detection task into two components: class-independent cell localization (recognition of any tumor cell type) and cell-type classification (categorizing localized cells as the correct types). We found (average) F1 scores of 0.69 and 0.93 for the localization and classification tasks, respectively. Thus, we observe that the algorithm does not generally misclassify cells, but rather, the errors mainly arise from missing cells or false positive predictions of cells that do not belong to the three target classes. - + Conclusion Future work will focus on improving the cell localization performance of the algorithm. Cell localization of the three target classes will be an important task in a clinical application of our model, in which it could be used to improve the detection of DGC lesions among large sets of slides. Moreover, the algorithm will allow for quantitative assessment of DGC patterns, potentially giving new insights in specific morphological features of DGC such as patterns of spatial cell distributions.}, optnote = {DIAG, PATHOLOGY}, @@ -19283,14 +19283,14 @@ @conference{Mann16c booktitle = RSNA, year = {2016}, abstract = {PURPOSE: White matter (WM) and gray matter (GM) respond differently to ischemia and thrombolytic treatment. Being able to differentiate WM/GM in CT enables tissue dependent perfusion analysis and automated detection of stroke related pathology. In this work we show the feasibility of segmenting WM/GM in 4DCT images of acute ischemic stroke patients. - + METHOD AND MATERIALS: In total 18 stroke patients who received both a 4DCT and followup MR scan were included in this retrospective study. CT imaging was done on a 320 row scanner with 19 or 24 volumetric acquisitions after contrast injection resulting in 512x512x320 isotropic voxels of 0.5 mm. T1w imaging was done on a 1.5T MR scanner resulting in approximately 384x318x26 voxels of 0.6x0.6x5.5 mm. 
The MR image was segmented with FSL tools and served as reference standard to train and evaluate the method. The method starts with brain segmentation by atlas registration followed by a refinement using a geodesic active contour with dominating advection term steered by a gradient based speed function. Within the segmented brain, three groups of features are then extracted: intensity, contextual and temporal, including a multiscale representation of the temporal average image weighted according to the exposures of the individual time points to maximize the signaltonoise ratios. In total 120 features were then fed into a nonlinear support vector machine with Gaussian radial basis kernel. A leaveonepatient out cross validation was carried out. Segmentation results were visually inspected for overall quality. Dice coefficient (DC) and 95th percentile Hausdorff distance (HD) were reported. - + RESULTS: The segmentations were evaluated as good with the separation of WM/GM at the cortex good to excellent. GM segmentation at the cortex had generally less thickness variations compared to the reference standard. DC were 0.79+-0.06 and 0.77+-0.06, 95% HD were 8.71+-3.22 and 7.11+-3.93 mm, for WM and GM, respectively. - + CONCLUSION: WM and GM segmentation in 4DCT is feasible. - - + + CLINICAL RELEVANCE/APPLICATION: WM and GM segmentation in 4DCT enables tissue dependent perfusion analysis and may increase sensitivity of detecting core and penumbra. Volume measurements of WM and GM normalized with the contralateral side may yield an important diagnostic parameter in the acute phase of ischemia.}, optnote = {DIAG, RADIOLOGY}, } @@ -19585,13 +19585,13 @@ @article{Meij15c doi = {10.1002/mrm.26024}, abstract = {Purpose There is currently controversy regarding the benefits of deconvolution-based parameters in stroke imaging, with studies suggesting a similar infarct prediction using summary parameters. 
We investigate here the performance of deconvolution-based parameters and summary parameters for dynamic-susceptibility contrast (DSC) MRI analysis, with particular emphasis on precision. - + Methods Numerical simulations were used to assess the contribution of noise and arterial input function (AIF) variability to measurement precision. A realistic AIF range was defined based on in vivo data from an acute stroke clinical study. The simulated tissue curves were analyzed using two popular singular value decomposition (SVD) based algorithms, as well as using summary parameters. - + Results SVD-based deconvolution methods were found to considerably reduce the AIF-dependency, but a residual AIF bias remained on the calculated parameters. Summary parameters, in turn, show a lower sensitivity to noise. The residual AIF-dependency for deconvolution methods and the large AIF-sensitivity of summary parameters was greatly reduced when normalizing them relative to normal tissue. - + Conclusion Consistent with recent studies suggesting high performance of summary parameters in infarct prediction, our results suggest that DSC-MRI analysis using properly normalized summary parameters may have advantages in terms of lower noise and AIF-sensitivity as compared to commonly used deconvolution methods.}, file = {Meij15c.pdf:pdf\\Meij15c.pdf:PDF}, @@ -19607,13 +19607,13 @@ @conference{Meij16 booktitle = RSNA, year = {2016}, abstract = {PURPOSE: Due to partial volume effects, accurate segmentation of small cerebral vessels on {CT} is a challenge. We present a novel technique that incorporates local intensity histogram information to segment the cerebral vasculature on {CT} perfusion ({CTP}) scans for suspected ischemic stroke. - + METHOD AND MATERIALS: A pattern recognition approach based on global and local image features followed by a random forest classifier is proposed. 
The features consist of an automatically computed brain mask denoting intracranial tissue, the first volume of the {CTP} scan, the {CTP} scan temporal average weighted according to the individual exposures to maximize signal-to-noise ratio, the weighted temporal variance ({WTV}), and local histogram features of the {WTV} calculated in a neighborhood of 9x9x9 voxels around a centered voxel. The mean, standard deviation, entropy and mode of the histogram are extracted as local feature values. In total 26 patients that underwent {CTP} for suspicion of stroke were included in this study. The {CTP} was acquired on a 320-detector row scanner. Image size was 512x512x320 voxels by 19 time points with voxel sizes of approximately 0.5 mm. Training was done on 8 patients with manually annotated data. The remaining 18 patients were used as testing set. Segmentations were visually inspected for completeness and overall quality. 3D-patches including the {M2}/{M3} segments of the middle cerebral artery were manually annotated for quantitative evaluation. The modified Hausdorff distance ({MHD}) (maximum of the median {HD}s) and the accuracy (true positive + true negative voxels divided by amount of voxels in a patch) of the segmentation were reported for the annotated patches. - + RESULTS: Overall the method was capable of segmenting the complete cerebral vasculature with inclusion of very small distal vessels. Parts of one internal carotid was missed in one patient because of clipping artefacts. In 3 patients false positive voxels were observed in the skull base region near the internal carotid artery and cavernous sinus. The {MHD} was 0.51+-0.28 mm, which is similar to the voxel spacing, and the accuracy was 0.97+-0.01. - + CONCLUSION: Our approach provides high-quality segmentation of small cerebral vessels from {CTP} data. 
- + CLINICAL RELEVANCE/APPLICATION: The high quality segmentation provided by our approach is an important step towards the automated localization and evaluation of vascular pathology in acute stroke patients.}, file = {Meij16.pdf:pdf\\Meij16.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -20363,13 +20363,13 @@ @article{Mert17 doi = {10.1002/mp.12077}, abstract = {PURPOSE: In breast imaging, radiological in vivo images, such as x-ray mammography and magnetic resonance imaging (MRI), are used for tumor detection, diagnosis, and size determination. After excision, the specimen is typically sliced into slabs and a small subset is sampled. Histopathological imaging of the stained samples is used as the gold standard for characterization of the tumor microenvironment. A 3D volume reconstruction of the whole specimen from the 2D slabs could facilitate bridging the gap between histology and in vivo radiological imaging. This task is challenging, however, due to the large deformation that the breast tissue undergoes after surgery and the significant undersampling of the specimen obtained in histology. In this work, we present a method to reconstruct a coherent 3D volume from 2D digital radiographs of the specimen slabs. - + METHODS: To reconstruct a 3D breast specimen volume, we propose the use of multiple target neighboring slices, when deforming each 2D slab radiograph in the volume, rather than performing pairwise registrations. The algorithm combines neighborhood slice information with free-form deformations, which enables a flexible, nonlinear deformation to be computed subject to the constraint that a coherent 3D volume is obtained. The neighborhood information provides adequate constraints, without the need for any additional regularization terms. 
- + RESULTS: The volume reconstruction algorithm is validated on clinical mastectomy samples using a quantitative assessment of the volume reconstruction smoothness and a comparison with a whole specimen 3D image acquired for validation before slicing. Additionally, a target registration error of 5 mm (comparable to the specimen slab thickness of 4 mm) was obtained for five cases. The error was computed using manual annotations from four observers as gold standard, with interobserver variability of 3.4 mm. Finally, we illustrate how the reconstructed volumes can be used to map histology images to a 3D specimen image of the whole sample (either MRI or CT). - + CONCLUSIONS: Qualitative and quantitative assessment has illustrated the benefit of using our proposed methodology to reconstruct a coherent specimen volume from serial slab radiographs. To our knowledge, this is the first method that has been applied to clinical breast cases, with the goal of reconstructing a whole specimen sample. The algorithm can be used as part of the pipeline of mapping histology images to ex vivo and ultimately in vivo radiological images of the breast.}, file = {Mert17.pdf:pdf\\Mert17.pdf:PDF}, @@ -22505,11 +22505,11 @@ @article{Noot22 title = {Knowledge distillation with ensembles of convolutional neural networks for medical image segmentation}, doi = {https://doi.org/10.1117/1.JMI.9.5.052407}, abstract = {Purpose: Ensembles of convolutional neural networks (CNNs) often outperform a single CNN in medical image segmentation tasks, but inference is computationally more expensive and makes ensembles unattractive for some applications. We compared the performance of differently constructed ensembles with the performance of CNNs derived from these ensembles using knowledge distillation, a technique for reducing the footprint of large models such as ensembles. 
- + Approach: We investigated two different types of ensembles, namely, diverse ensembles of networks with three different architectures and two different loss-functions, and uniform ensembles of networks with the same architecture but initialized with different random seeds. For each ensemble, additionally, a single student network was trained to mimic the class probabilities predicted by the teacher model, the ensemble. We evaluated the performance of each network, the ensembles, and the corresponding distilled networks across three different publicly available datasets. These included chest computed tomography scans with four annotated organs of interest, brain magnetic resonance imaging (MRI) with six annotated brain structures, and cardiac cine-MRI with three annotated heart structures. - + Results: Both uniform and diverse ensembles obtained better results than any of the individual networks in the ensemble. Furthermore, applying knowledge distillation resulted in a single network that was smaller and faster without compromising performance compared with the ensemble it learned from. The distilled networks significantly outperformed the same network trained with reference segmentation instead of knowledge distillation. - + Conclusion: Knowledge distillation can compress segmentation ensembles of uniform or diverse composition into a single CNN while maintaining the performance of the ensemble.}, file = {Noot22.pdf:pdf\\Noot22.pdf:PDF}, journal = {Journal of Medical Imaging}, @@ -22616,11 +22616,11 @@ @article{Oei18 pages = {3902-3911}, doi = {10.1007/s00330-018-5353-y}, abstract = {Objectives: To assess observer variability of different reference tissues used for relative CBV (rCBV) measurements in DSC-MRI of glioma patients. - + Methods: In this retrospective study, three observers measured rCBV in DSC-MR images of 44 glioma patients on two occasions. 
rCBV is calculated by the CBV in the tumour hotspot/the CBV of a reference tissue at the contralateral side for normalization. One observer annotated the tumour hotspot that was kept constant for all measurements. All observers annotated eight reference tissues of normal white and grey matter. Observer variability was evaluated using the intraclass correlation coefficient (ICC), coefficient of variation (CV) and Bland-Altman analyses. - + Results: For intra-observer, the ICC ranged from 0.50-0.97 (fair-excellent) for all reference tissues. The CV ranged from 5.1-22.1 % for all reference tissues and observers. For inter-observer, the ICC for all pairwise observer combinations ranged from 0.44-0.92 (poor-excellent). The CV ranged from 8.1-31.1 %. Centrum semiovale was the only reference tissue that showed excellent intra- and inter-observer agreement (ICC>0.85) and lowest CVs (<12.5 %). Bland-Altman analyses showed that mean differences for centrum semiovale were close to zero. - + Conclusion: Selecting contralateral centrum semiovale as reference tissue for rCBV provides the lowest observer variability.}, file = {:Oei18 - Observer Variability of Reference Tissue Selection for Relative Cerebral Blood Volume Measurements in Glioma Patients.pdf:PDF}, optnote = {DIAG, RADIOLOGY}, @@ -22672,13 +22672,13 @@ @article{Ogon22 abstract = {Abstract Background Breast terminal duct lobular units (TDLUs), the source of most breast cancer (BC) precursors, are shaped by age-related involution, a gradual process, and postpartum involution (PPI), a dramatic inflammatory process that restores baseline microanatomy after weaning. Dysregulated PPI is implicated in the pathogenesis of postpartum BCs. We propose that assessment of TDLUs in the postpartum period may have value in risk estimation, but characteristics of these tissues in relation to epidemiological factors are incompletely described. 
- + Methods Using validated Artificial Intelligence and morphometric methods, we analyzed digitized images of tissue sections of normal breast tissues stained with hematoxylin and eosin from donors <= 45 years from the Komen Tissue Bank (180 parous and 545 nulliparous). Metrics assessed by AI, included: TDLU count; adipose tissue fraction; mean acini count/TDLU; mean dilated acini; mean average acini area; mean "capillary" area; mean epithelial area; mean ratio of epithelial area versus intralobular stroma; mean mononuclear cell count (surrogate of immune cells); mean fat area proximate to TDLUs and TDLU area. We compared epidemiologic characteristics collected via questionnaire by parity status and race, using a Wilcoxon rank sum test or Fisher's exact test. Histologic features were compared between nulliparous and parous women (overall and by time between last birth and donation [recent birth: <= 5 years versus remote birth: > 5 years]) using multivariable regression models. - + Results Normal breast tissues of parous women contained significantly higher TDLU counts and acini counts, more frequent dilated acini, higher mononuclear cell counts in TDLUs and smaller acini area per TDLU than nulliparas (all multivariable analyses p < 0.001). Differences in TDLU counts and average acini size persisted for > 5 years postpartum, whereas increases in immune cells were most marked <= 5 years of a birth. Relationships were suggestively modified by several other factors, including demographic and reproductive characteristics, ethanol consumption and breastfeeding duration. - + Conclusions Our study identified sustained expansion of TDLU numbers and reduced average acini area among parous versus nulliparous women and notable increases in immune responses within five years following childbirth. Further, we show that quantitative characteristics of normal breast samples vary with demographic features and BC risk factors. 
}, @@ -22701,10 +22701,10 @@ @article{Olac20 volume = {71}, abstract = {Purpose: EPID dosimetry in the Unity MR-Linac system allows for reconstruction of absolute dose distributions within the patient geometry. Dose reconstruction is accurate for the parts of the beam arriving at the EPID through the MRI central unattenuated region, free of gradient coils, resulting in a maximum field size of ~10x22 cm2 at isocentre. The purpose of this study is to develop a Deep Learning-based method to improve the accuracy of 2D EPID reconstructed dose distributions outside this central region, accounting for the effects of the extra attenuation and scatter. - + Methods: A U-Net was trained to correct EPID dose images calculated at the isocenter inside a cylindrical phantom using the corresponding TPS dose images as ground truth for training. The model was evaluated using a 5-fold cross validation procedure. The clinical validity of the U-Net corrected dose images (the so-called DEEPID dose images) was assessed with in vivo verification data of 45 large rectum IMRT fields. The sensitivity of DEEPID to leaf bank position errors (+-1.5 mm) and +-5% MU delivery errors was also tested. - + Results: Compared to the TPS, in vivo 2D DEEPID dose images showed an average gamma-pass rate of 90.2% (72.6%-99.4%) outside the central unattenuated region. Without DEEPID correction, this number was 44.5% (4.0%-78.4%). DEEPID correctly detected the introduced delivery errors. Conclusions: DEEPID allows for accurate dose reconstruction using the entire EPID image, thus enabling dosimetric verification for field sizes up to ~19x22 cm2 at isocentre.
The method can be used to detect clinically relevant errors.}, @@ -22847,7 +22847,7 @@ @mastersthesis{Oude19 title = {Reversible Networks for Memory-efficient Image-to-Image Translation in 3D Medical Imaging}, year = {2019}, abstract = {The Pix2pix and CycleGAN losses have vastly improved the qualitative and quantitative visual quality of results in image-to-image translation tasks. We extend this framework by exploring approximately invertible architectures which are well suited to these losses. These architectures are approximately invertible by design and thus partially satisfy cycle-consistency before training even begins. Furthermore, since invertible architectures have constant memory complexity in depth, these models can be built arbitrarily deep. We are able to demonstrate superior quantitative output on the Cityscapes and Maps datasets. - + Additionally, we show that the model allows us to perform several memory-intensive medical imaging tasks, including a super-resolution problem on 3D MRI brain volumes. We also demonstrate that our model can perform a 3D domain-adaptation and 3D super-resolution task on chest CT volumes. By doing this, we provide a proof-of-principle for using reversible networks to create a model capable of pre-processing 3D CT scans to high resolution with a standardized appearance.}, file = {Oude19.pdf:pdf/Oude19.pdf:PDF}, optnote = {DIAG}, @@ -23248,16 +23248,16 @@ @article{Pfob22 year = {2022}, abstract = {Abstract Objectives AI-based algorithms for medical image analysis showed comparable performance to human image readers. However, in practice, diagnoses are made using multiple imaging modalities alongside other data sources. We determined the importance of this multi-modal information and compared the diagnostic performance of routine breast cancer diagnosis to breast ultrasound interpretations by humans or AI-based algorithms. - + Methods Patients were recruited as part of a multicenter trial (NCT02638935). 
The trial enrolled 1288 women undergoing routine breast cancer diagnosis (multi-modal imaging, demographic, and clinical information). Three physicians specialized in ultrasound diagnosis performed a second read of all ultrasound images. We used data from 11 of 12 study sites to develop two machine learning (ML) algorithms using unimodal information (ultrasound features generated by the ultrasound experts) to classify breast masses which were validated on the remaining study site. The same ML algorithms were subsequently developed and validated on multi-modal information (clinical and demographic information plus ultrasound features). We assessed performance using area under the curve (AUC). - + Results Of 1288 breast masses, 368 (28.6%) were histopathologically malignant. In the external validation set (n = 373), the performance of the two unimodal ultrasound ML algorithms (AUC 0.83 and 0.82) was commensurate with performance of the human ultrasound experts (AUC 0.82 to 0.84; p for all comparisons > 0.05). The multi-modal ultrasound ML algorithms performed significantly better (AUC 0.90 and 0.89) but were statistically inferior to routine breast cancer diagnosis (AUC 0.95, p for all comparisons <= 0.05). - + Conclusions The performance of humans and AI-based algorithms improves with multi-modal information. - + Key Points * The performance of humans and AI-based algorithms improves with multi-modal information. * Multimodal AI-based algorithms do not necessarily outperform expert humans. @@ -23694,13 +23694,13 @@ @article{Pomp16a doi = {10.1016/j.ejrad.2016.09.009}, abstract = {Objectives Airway wall thickness (AWT) is affected by changes in lung volume. This study evaluated whether correcting AWT on computed tomography (CT) for differences in inspiration level improves measurement agreement, reliability, and power to detect changes over time. 
- + Methods Participants of the Dutch-Belgian lung cancer screening trial who underwent 3-month repeat CT for an indeterminate pulmonary nodule were included. AWT on CT was calculated by the square root of the wall area at a theoretical airway with an internal perimeter of 10 mm (Pi10). The scan with the highest lung volume was labelled as the reference scan and the scan with the lowest lung volume was labelled as the comparison scan. Pi10 derived from the comparison scan was corrected by multiplying it with the ratio of CT lung volume of the comparison scan to CT lung volume on the reference scan. Agreement of uncorrected and corrected Pi10 was studied with the Bland-Altman method, reliability with intra-class correlation coefficients (ICC), and power to detect changes over time was calculated. - + Results 315 male participants were included. Limit of agreement and reliability for Pi10 was -0.61 to 0.57 mm (ICC = 0.87), which improved to -0.38 to 0.37 mm (ICC = 0.94) after correction for inspiration level. To detect a 15% change over 3 months, 71 subjects are needed for Pi10 and 26 subjects for Pi10 adjusted for inspiration level. - + Conclusions Correcting Pi10 for differences in inspiration level improves reliability, agreement, and power to detect changes over time.}, file = {Pomp16a.pdf:pdf\\Pomp16a.pdf:PDF},
- - + + Methods: Between May 2019 and May 2020, 163 patients planned for open CTR or TFR under local anesthesia were included. Before surgery, and at 3, 6, and 12 months postoperatively, Quick Disabilities of the Arm, Shoulder and Hand and Boston Carpal Tunnel questionnaires were administered, and complications were noted. At 6 months postoperatively, an ultrasound was conducted to determine the thickness of scar tissue in the region of median nerve. - - + + Results: A total of 142 patients (51 men [38%]) were included. The Quick Disabilities of the Arm, Shoulder and Hand questionnaire and Boston Carpal Tunnel Questionnaire scores improved significantly in both groups during follow-up, wherein most improvements were seen in the first 3 months. No difference in clinical outcome and scar tissue formation was found between the two groups after 12 months. The complication rate was comparable between both groups. Thirty-two (24%) patients had at least one complication, none needed surgical interventions, and no recurrent symptoms were seen. - - + + Conclusions: Our study shows similar long-term clinical outcomes, formation of scar tissue, and complication rates for patients undergoing CTR or TFR with or without a tourniquet. Tourniquet usage should be based on shared decision-making. }, @@ -25454,13 +25454,13 @@ @article{Rutg21 abstract = {Abstract Background Histopathological classification of Wilms tumors determines treatment regimen. Machine learning has been shown to contribute to histopathological classification in various malignancies but requires large numbers of manually annotated images and thus specific pathological knowledge. This study aimed to assess whether trained, inexperienced observers could contribute to reliable annotation of Wilms tumor components for classification performed by machine learning. 
- + Methods Four inexperienced observers (medical students) were trained in histopathology of normal kidneys and Wilms tumors by an experienced observer (pediatric pathologist). Twenty randomly selected scanned Wilms tumor-slides (from n = 1472 slides) were annotated, and annotations were independently classified by both the inexperienced observers and two experienced pediatric pathologists. Agreement between the six observers and for each tissue element was measured using kappa statistics (k). - + Results Pairwise interobserver agreement between all inexperienced and experienced observers was high (range: 0.845-0.950). The interobserver variability for the different histological elements, including all vital tumor components and therapy-related effects, showed high values for all k-coefficients (> 0.827). - + Conclusions Inexperienced observers can be trained to recognize specific histopathological tumor and tissue elements with high interobserver agreement with experienced observers. Nevertheless, supervision by experienced pathologists remains necessary. Results of this study can be used to facilitate more rapid progress for supervised machine learning-based algorithm development in pediatric pathology and beyond. }, @@ -25573,7 +25573,7 @@ @article{Saha21a doi = {10.1016/j.media.2021.102155}, url = {https://www.sciencedirect.com/science/article/pii/S1361841521002012}, abstract = {We present a multi-stage 3D computer-aided detection and diagnosis (CAD) model for automated localization of clinically significant prostate cancer (csPCa) in bi-parametric MR imaging (bpMRI). Deep attention mechanisms drive its detection network, targeting salient structures and highly discriminative feature dimensions across multiple resolutions. Its goal is to accurately identify csPCa lesions from indolent cancer and the wide range of benign pathology that can afflict the prostate gland. 
Simultaneously, a decoupled residual classifier is used to achieve consistent false positive reduction, without sacrificing high sensitivity or computational efficiency. In order to guide model generalization with domain-specific clinical knowledge, a probabilistic anatomical prior is used to encode the spatial prevalence and zonal distinction of csPCa. Using a large dataset of 1950 prostate bpMRI paired with radiologically-estimated annotations, we hypothesize that such CNN-based models can be trained to detect biopsy-confirmed malignancies in an independent cohort. - + For 486 institutional testing scans, the 3D CAD system achieves 83.69+-5.22% and 93.19+-2.96% detection sensitivity at 0.50 and 1.46 false positive(s) per patient, respectively, with 0.882+-0.030 AUROC in patient-based diagnosis -significantly outperforming four state-of-the-art baseline architectures (U-SEResNet, UNet++, nnU-Net, Attention U-Net) from recent literature. For 296 external biopsy-confirmed testing scans, the ensembled CAD system shares moderate agreement with a consensus of expert radiologists (76.69%; kappa = 0.51+-0.04) and independent pathologists (81.08%; kappa = 0.56+-0.06); demonstrating strong generalization to histologically-confirmed csPCa diagnosis.}, optnote = {DIAG, RADIOLOGY}, algorithm = {https://grand-challenge.org/algorithms/prostate-mri-cad-cspca/}, @@ -26082,7 +26082,7 @@ @conference{Sand19 conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence. 
- + Introduction The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in @@ -26092,7 +26092,7 @@ @conference{Sand19 detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was developed. In this study we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and breast imaging reporting and data system (BI-RADS ) score. - + Methods From September 2017 to August 2018, 15 women with known breast cancer or suspicious breast lesions were included, after providing signed informed consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM @@ -26105,13 +26105,13 @@ @conference{Sand19 and 10 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible), and a BI-RADS score (1 to 5) was given for each lesion. Statistical analysis was performed in SPSS using the Wilcoxon signed-rank test. - + Results Results are presented in Table 1. Overall IQ was significantly higher for the conventional rs-DWI-EPI than for the SMS-ss-DWI-EPI (p=0.006). Lesion conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.016). Benign lesions had similar conspicuity with both sequences while malignant lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.027) (for example, see Figure 1). There was no significant difference in BI-RADS scores (p=0.151) between the two sequences. 
- + Discussion Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI. @@ -26120,7 +26120,7 @@ @conference{Sand19 sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is needed for improved IQ and even better lesion conspicuity. Extension of the data pool and evaluation by additional readers is pending. - + Conclusion Despite the perceived poorer image quality of the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.}, @@ -26153,19 +26153,19 @@ @conference{Sand20 title = {Simultaneous multi-slice single-shot DWI compared to routine read-out-segmented DWI for evaluation of breast lesions}, abstract = {Synopsis The aim of this study was to compare a prototype simultaneous multi-slice single-shot echo planar imaging (SMS-ss-DWI-EPI) sequence with conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence. 
- + Introduction The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in most state-of-the-art breast MRI protocols. The echo train of the readout-segmented echo-planar imaging-based DWI sequence (rs-DWI-EPI) was shortened to reduce distortion and improve the resulting image quality. However, this sequence results in a lower signal-to-noise ratio (SNR) than single-shot echo planar imaging (ss-EPI). In practice, detection of lesions on DWI is often problematic due to a relatively low lesion conspicuity. To improve the detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was developed. In this study, we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and the presence of artifacts. - + Methods From September 2017 to December 2018, 25 women with known breast cancer or suspicious breast lesions were included, after providing signed informed consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM Skyra system (Siemens Healthcare, Erlangen, Germany) using a 16-channel bilateral breast coil. Parameters of the rs-DWI-EPI sequence were: TR: 5450 ms, TE: 57 ms, FoV: 340 mm, voxel size: 1.2x1.2x5 mm , acquisition time: 4:23 min, b-values: 50, 850 s/mm , SPAIR fat suppression. Parameters of the SMS-ss-DWI-EPI sequence were: TR: 4000 ms, TE: 70 ms, FoV: 360 mm, voxel size: 0.9(i)x0.9(i)x4 mm, acquisition time: 2:45 min, b-values: 50, 400, 800 s/mm , SPAIR fat suppression. 
In addition, the clinical protocol included one pre- and five post-contrast regular T1-weighted Dixon acquisitions, ultrafast T1-weighted TWIST acquisitions during the inflow of contrast, and a T2 weighted Dixon acquisition. In total, 42 malignant (32 invasive ductal carcinomas, 4 invasive lobular carcinomas, 1 ductal carcinoma in situ and 5 other malignant lesions) and 12 benign lesions were detected on the contrast-enhanced series. Malignant lesions had a mean MRI size of 18.7 mm +- 15.1 mm (range: 3 - 92 mm) and benign lesions had a mean size of 5.9 mm +- 3.8 mm (range: 3 - 15 mm). Four dedicated breast radiologists (4 to 15 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible). Statistical analysis was performed in SPSS using Generalized Linear Models and the Wilcoxon signed-rank test. - + Results Overall IQ was significantly higher for the conventional rs-DWI-EPI (Mean +- SD: 5.5 +- 1.9) than for the SMS-ss-DWI-EPI (Mean +- SD: 4.2 +- 2.0) (p=0.002). Lesion conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.009). Benign lesions had similar conspicuity with both sequences while malignant lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.041) (for example, see Figure 1). Infolding and ghosting artifacts were scored as disturbing or worse by 2 or more radiologists in 6 and 15 cases, for Resolve and SMS respectively. Distortion artifacts were scored as disturbing or worse in 4 and 17 cases, respectively. - + Discussion: Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI. 
Simultaneous Multi-Slice (SMS) ensures that slices are excited simultaneously with a multiband pulse, which leads to a reduced acquisition time. In our protocol, the combination of ss-EPI and SMS results in a higher spatial resolution while still having a shorter acquisition time than the conventional sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is needed for improved IQ, decreased presence of artifacts and even better lesion conspicuity. - + Conclusion Despite the perceived poorer image quality and the more disturbing presence of artifacts in the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.}, optnote = {DIAG}, @@ -26857,16 +26857,16 @@ @article{Scha21a abstract = {Abstract Objectives The individual course of disease in idiopathic pulmonary fibrosis (IPF) is highly variable. Assessment of disease activity and prospective estimation of disease progression might have the potential to improve therapy management and indicate the onset of treatment at an earlier stage. The aim of this study was to evaluate whether regional ventilation, lung perfusion, and late enhancement can serve as early imaging markers for disease progression in patients with IPF. - + Methods In this retrospective study, contrast-enhanced dual-energy CT scans of 32 patients in inspiration and delayed expiration were performed at two time points with a mean interval of 15.4 months. The pulmonary blood volume (PBV) images obtained in the arterial and delayed perfusion phase served as a surrogate for arterial lung perfusion and parenchymal late enhancement. 
The virtual non-contrast (VNC) images in inspiration and expiration were non-linearly registered to provide regional ventilation images. Image-derived parameters were correlated with longitudinal changes of lung function (FVC%, DLCO%), mean lung density in CT, and CT-derived lung volume. - + Results Regional ventilation and late enhancement at baseline preceded future change in lung volume (R = -0.474, p = 0.006/R = -0.422, p = 0.016, respectively) and mean lung density (R = -0.469, p = 0.007/R = -0.402, p = 0.022, respectively). Regional ventilation also correlated with a future change in FVC% (R = -0.398, p = 0.024). - + Conclusion CT-derived functional parameters of regional ventilation and parenchymal late enhancement are potential early imaging markers for idiopathic pulmonary fibrosis progression. - + Key Points * Functional CT parameters at baseline (regional ventilation and late enhancement) correlate with future structural changes of the lung as measured with loss of lung volume and increase in lung density in serial CT scans of patients with idiopathic pulmonary fibrosis. * Functional CT parameter measurements in high-attenuation areas (- 600 to - 250 HU) are significantly different from normal-attenuation areas (- 950 to - 600 HU) of the lung. @@ -26925,16 +26925,16 @@ @article{Scha22b abstract = {Abstract Objectives Idiopathic pulmonary fibrosis (IPF) is a disease with a poor prognosis and a highly variable course. Pathologically increased ventilation--accessible by functional CT--is discussed as a potential predecessor of lung fibrosis. The purpose of this feasibility study was to investigate whether increased regional ventilation at baseline CT and morphological changes in the follow-up CT suggestive for fibrosis indeed occur in spatial correspondence. - + Methods In this retrospective study, CT scans were performed at two time points between September 2016 and November 2020.
Baseline ventilation was divided into four categories ranging from low, normal to moderately, and severely increased (C1-C4). Correlation between baseline ventilation and volume and density change at follow-up was investigated in corresponding voxels. The significance of the difference of density and volume change per ventilation category was assessed using paired t-tests with a significance level of p <= 0.05. The analysis was performed separately for normal (NAA) and high attenuation areas (HAA). - + Results The study group consisted of 41 patients (73 +- 10 years, 36 men). In both NAA and HAA, significant increases of density and loss of volume were seen in areas of severely increased ventilation (C4) at baseline compared to areas of normal ventilation (C2, p < 0.001). In HAA, morphological changes were more heterogeneous compared to NAA. - + Conclusion Functional CT assessing the extent and distribution of lung parenchyma with pathologically increased ventilation may serve as an imaging marker to prospectively identify lung parenchyma at risk for developing fibrosis. - + Key Points * Voxelwise correlation of serial CT scans suggests spatial correspondence between increased ventilation at baseline and structural changes at follow-up. * Regional assessment of pathologically increased ventilation at baseline has the potential to prospectively identify tissue at risk for developing fibrosis. @@ -28101,9 +28101,9 @@ @article{Sech20 title = {Artificial Intelligence for Breast Cancer Detection in Mammography: state of the art}, doi = {10.1016/j.semcancer.2020.06.002}, abstract = {Screening for breast cancer with mammography has been introduced in various countries over the last 30 years, initially using analog screen-film-based systems and, over the last 20 years, transitioning to the use of fully digital systems. 
With the introduction of digitization, the computer interpretation of images has been a subject of intense interest, resulting in the introduction of computer-aided detection (CADe) and diagnosis (CADx) algorithms in the early 2000's. Although they were introduced with high expectations, the potential improvement in the clinical realm failed to materialize, mostly due to the high number of false positive marks per analyzed image. - + In the last five years, the artificial intelligence (AI) revolution in computing, driven mostly by deep learning and convolutional neural networks, has also pervaded the field of automated breast cancer detection in digital mammography and digital breast tomosynthesis. Research in this area first involved comparison of its capabilities to that of conventional CADe/CADx methods, which quickly demonstrated the potential of this new technology. In the last couple of years, more mature and some commercial products have been developed, and studies of their performance compared to that of experienced breast radiologists are showing that these algorithms are on par with human-performance levels in retrospective data sets. Although additional studies, especially prospective evaluations performed in the real screening environment, are needed, it is becoming clear that AI will have an important role in the future breast cancer screening realm. Exactly how this new player will shape this field remains to be determined, but recent studies are already evaluating different options for implementation of this technology. - + The aim of this review is to provide an overview of the basic concepts and developments in the field of AI for breast cancer detection in digital mammography and digital breast tomosynthesis. The pitfalls of conventional methods, and how these are, for the most part, avoided by this new technology, will be discussed.
Importantly, studies that have evaluated the current capabilities of AI and proposals for how these capabilities should be leveraged in the clinical realm will be reviewed, while the questions that need to be answered before this vision becomes a reality are posed.}, file = {Sech20.pdf:pdf/Sech20.pdf:PDF}, journal = {Seminars in Cancer Biology}, @@ -28345,13 +28345,13 @@ @article{Sier20 pages = {3198-3209}, abstract = {Objectives The diagnostic reading of follow-up low-dose whole-body computed tomography (WBCT) examinations in patients with multiple myeloma (MM) is a demanding process. This study aimed to evaluate the diagnostic accuracy and benefit of a novel software program providing rapid-subtraction maps for bone lesion change detection. - + Methods Sixty patients (66 years +- 10 years) receiving 120 WBCT examinations for follow-up evaluation of MM bone disease were identified from our imaging archive. The median follow-up time was 292 days (range 200-641 days). Subtraction maps were calculated from 2-mm CT images using a nonlinear deformation algorithm. Reading time, correctly assessed lesions, and disease classification were compared to a standard reading software program. De novo clinical reading by a senior radiologist served as the reference standard. Statistics included Wilcoxon rank-sum test, Cohen's kappa coefficient, and calculation of sensitivity, specificity, positive/negative predictive value, and accuracy. - + Results Calculation time for subtraction maps was 84 s +- 24 s. Both readers reported exams faster using subtraction maps (reader A, 438 s +- 133 s; reader B, 1049 s +- 438 s) compared to PACS software (reader A, 534 s +- 156 s; reader B, 1486 s +- 587 s; p < 0.01). The course of disease was correctly classified by both methods in all patients. Sensitivity for lesion detection in subtraction maps/conventional reading was 92%/80% for reader A and 88%/76% for reader B. Specificity was 98%/100% for reader A and 95%/96% for reader B. 
- + Conclusion A software program for the rapid-subtraction map calculation of follow-up WBCT scans has been successfully tested and seems suited for application in clinical routine. Subtraction maps significantly facilitated reading of WBCTs by reducing reading time and increasing sensitivity.}, file = {Sier20.pdf:pdf\\Sier20.pdf:PDF}, @@ -28513,16 +28513,16 @@ @mastersthesis{Sloo20 title = {Patient variables related to false predictions of deep-learning assisted prostate cancer detection in MRI}, abstract = {Background: DL-CAD for prediction of clinically significant prostate cancer (csPCa) in mpMRI is developed to aid radiologists in PI-RADS evaluation. DL-CAD predictions have low accuracy, possibly due to clinical risk factors of csPCa that are not taken into account by DL-CAD. - + Purpose: Aim is to identify patient subgroups of clinical characteristics in which DL-CAD predictions differ from radiologists. - + Methods: DL-CAD was applied to a test cohort of men examined for PCa according to PI-RADSv2 between 2016 and 2017. Ground truth was provided by manually annotated PI-RADS >=4 lesions. Patient age and PSA were derived from the electronic patient record and other variables were mined from the written radiological reports. False and correct predicted patients were compared on variable distributions and false positive rates were compared between variable categories. - + Results: CsPCa was predicted for a total of 482 men (36.9% PIRADS >=4). Benign and malignant patients statistically differed on all clinical variables (P<.05). DL-CAD negative predictive value and positive predictive value were 0.912 and 0.457, respectively. False and correct positive predicted patients significantly differed on age (P<.05), PSA (P<.001), and PSAD (P<.001) as well as prostate volume (P<.001), number of lesions (P<.001), and number of affected zones (P<.001). Analysis of negative predictions was inconclusive due to small population size. 
- + Conclusions: False positive DL-CAD csPCa predictions are due to unavailable clinical variables that are used in radiologists' PI-RADS risk assessment. We advise to study the effect of including age, PSA and PSAD information in DL-CAD input on prediction accuracy.}, file = {:pdf/Sloo20.pdf:PDF}, @@ -28702,22 +28702,22 @@ @conference{Smee18 title = {Tumor heterogeneity as a PET-biomarker predicts overall survival of pancreatic cancer patients}, abstract = {INTRODUCTION Pancreatic ductal adenocarcinoma (PDAC) shows a 5-year survival rate of 8%[1]. This mortality results from a lack of methods to accurately treat patients[2]. PDAC is remarkable for its fibrotic reaction, which is present at early stages of PDAC development[3]. Components of this environment can be measured on clinical images[4]. PET derived parameters, e.g. SUVmax, have not been able to provide prognostic information. In this study we developed an algorithm based on FDG-PET texture features (TF) that classifies heterogeneous or homogeneous tumors and shows a correlation with overall survival. - - + + METHODS In total, 121 patients with histologically proven PDAC who underwent 18F-FDG PET/CT (Siemens Biograph mCT, Knoxville, US) were selected from the hospital system. Eighty-six EANM reconstructed scans were visually labeled as 'homogeneous' or 'heterogeneous' by experienced Nuclear Medicine physicians and served as training set to develop the classifier [5]. All the 121 scans were used as validation set for the correlation with overall survival (OS). Tumors were delineated using 40% threshold of the SUVmax with manual correction. TF were extracted using the PyRadiomics toolbox [6]. TF were selected and tested for robustness as described in literature [7-9]. The classifier was built using logistic regression. Prognostic impact was assessed by Kaplan-Meier survival analysis and log-rank test. 
- - + + RESULTS Optimal performance of the leave-one-out cross-validation classifier in the training set yielded an accuracy of 0.73 and AUC of 0.71 in classifying PDAC as heterogeneous or homogeneous tumors. Of note, two tumors were visually labeled as homogeneous but correctly classified as heterogeneous by the classifier after review. For the 121 patients the OS of PDAC tumors classified as heterogeneous was significantly worse than for homogeneous tumors; median OS 69 weeks (95%CI 64 to 91 weeks) versus median 95 weeks (95%CI 76 to 114), p = 0.0285. This is in contrast with single standard PET parameters, single TF or manual labeling, which had no significant prognostic impact. - - + + CONCLUSIONS We developed an algorithm that accurately classifies PDAC as heterogeneous or homogeneous, based on a set of 18F-FDG PET derived texture features. We showed that the classification result has prognostic value, improving upon standard PET derived parameters and single texture-features. Further validation of this algorithm in an external cohort of PDAC patients is ongoing. - - + + REFERENCES - + [1] Siegel, R.L., K.D. Miller, and A. Jemal, Cancer statistics, 2016. CA Cancer J Clin, 2016. 66(1): p. 7-30. [2] Ryan, D.P., T.S. Hong, and N. Bardeesy, Pancreatic adenocarcinoma. N Engl J Med, 2014. 371(11): p. 1039-49. [3] Neesse, A., et al., Stromal biology and therapy in pancreatic cancer: a changing paradigm. Gut, 2015. 64(9): p. 1476-84. @@ -28848,13 +28848,13 @@ @article{Smit23 doi = {https://doi.org/10.1016/j.jpi.2023.100191}, abstract = {Background The amount of stroma within the primary tumor is a prognostic parameter for colon cancer patients. This phenomenon can be assessed using the tumor-stroma ratio (TSR), which classifies tumors in stroma-low (<=50% stroma) and stroma-high (>50% stroma). Although the reproducibility for TSR determination is good, improvement might be expected from automation. 
The aim of this study was to investigate whether the scoring of the TSR in a semi- and fully automated method using deep learning algorithms is feasible. - + Methods A series of 75 colon cancer slides were selected from a trial series of the UNITED study. For the standard determination of the TSR, 3 observers scored the histological slides. Next, the slides were digitized, color normalized, and the stroma percentages were scored using semi- and fully automated deep learning algorithms. Correlations were determined using intraclass correlation coefficients (ICCs) and Spearman rank correlations. - + Results 37 (49%) cases were classified as stroma-low and 38 (51%) as stroma-high by visual estimation. A high level of concordance between the 3 observers was reached, with ICCs of 0.91, 0.89, and 0.94 (all P<.001). Between visual and semi-automated assessment the ICC was 0.78 (95% CI 0.23-0.91, P-value 0.005), with a Spearman correlation of 0.88 (P<.001). Spearman correlation coefficients above 0.70 (N=3) were observed for visual estimation versus the fully automated scoring procedures. - + Conclusion Good correlations were observed between standard visual TSR determination and semi- and fully automated TSR scores. At this point, visual examination has the highest observer agreement, but semi-automated scoring could be helpful to support pathologists.}, file = {Smit23.pdf:pdf\\Smit23.pdf:PDF}, @@ -29102,13 +29102,13 @@ @conference{Spro22 year = {2022}, abstract = {Background Immunotherapy has become the standard of care for metastatic non-small cell lung cancer (mNSCLC) without a targetable driver alteration, yet we still lack insight into which patients (pts) will benefit from such treatments. To that end, we investigated characteristics of the immune infiltrate in the tumor microenvironment in relation to immunotherapy response. 
We report the results of an automated deep learning approach applied to digital H&E whole slide images (WSIs) of pre-treatment biopsies from the PEMBRO-RT clinical trial. - + Methods 61 quality-checked H&E WSIs were processed with 3 deep learning algorithms. We extracted a tissue mask using an existing method (Bandi et al., 2019), and detected tumor and immune cells using HoVerNet (Graham et al., 2019). Tumor clusters were identified by combining the output of HoVerNet and tumor segmentation from an nnUnet (Isensee et al., 2021) model that we trained on external NSCLC images. From the output of this pipeline, we extracted immune infiltrate-based density metrics, calculated over all tissue (allINF), stroma within 500um from the tumor border (sINF), tumor region (tINF), and the combination of stroma and tumor (t+sINF). All metrics were used in ROC analysis after dichotomizing pts as responders and non-responders (response was defined as complete or partial response at any time point or stable disease for >=12 weeks according to RECIST 1.1 measurement). Differences in metric distributions between the two groups were tested with a two-sided Welch t-test. Kaplan-Meier (KM) analysis was performed on progression-free survival (5-year follow-up). - + Results Our automated analysis reported denser immune infiltrates in responders, although not statistically significant (0.05 0.63, where tINF reported an AUC of 0.70. KM analysis showed p=0.07 if pts were stratified based on the median tINF, and p=0.02 if stratified based on the optimal operating point of its ROC curve. - + Conclusions Deep learning models that analyze the immune infiltrate density on H&E WSIs can identify mNSCLC responders to pembrolizumab.}, optnote = {DIAG, RADIOLOGY}, @@ -29451,7 +29451,7 @@ @conference{Stoi17a effect of the normalization was determined by computing and comparing the diagnostic accuracy using ROC analysis. 
Results The area under the ROC curve (AUC) was significantly higher (p<0.05) in normalized T2-weighted images. - + Discussion / Conclusion The significant improvement of the diagnostic accuracy demonstrates the potential of our normalization method for the quantitative interpretation of T2-weighted prostate MRI. The results were similar to our previous method. The method still requires manual delineation of multiple reference tissues; however, we will develop deep learning segmentation methods to automate the method and enable regular clinical use. @@ -29617,11 +29617,11 @@ @conference{Stud20 year = {2020}, optnote = {DIAG, RADIOLOGY}, abstract = {Background & objectives: Tumour budding and T-cells are robust prognostic biomarkers in colorectal cancer. A combined analysis is complex and can be greatly expedited and automated using deep learning. The implementation of computer-based analysis in diagnostics is challenging and necessitates extensive validation. - + Methods: Randomly selected (n=61) double-stained immunohistochemical slides (AE1-AE3 pancytokeratin for tumour buds and CD8 for cytotoxic T-cells) from our pT1 cohort from 3 different institutions were used to validate the deep learning algorithms for tumour budding and CD8 T-cell detection developed by the International Budding Consortium Computational Pathology Group. Staining and scanning were performed in a single laboratory. - + Results: In the visually identified tumour budding hotspot (0.785 mm2), tumour buds were manually annotated, and the output of the T-cell algorithm manually corrected by a single observer. For budding, 645 out of the 1'306 buds were correctly identified by the algorithm. Recall and precision were 49.4% and 61.4%, respectively. For the T-cells, 89.3% were correctly detected (from a total of 16'296). The recall was 90.3% and the precision was 87.3%. 
Reasons for misclassified T-cells included staining intensity, suboptimal tissue recognition and slide artifacts. - + Conclusion: Our preliminary data demonstrates satisfactory results for T-cell detection. Automated budding detection is more difficult, as inter-observer variability of bud calling is high among experts. These issues merit consideration when developing reliable deep learning algorithms examining the tumour/host interface.}, } @@ -29634,13 +29634,13 @@ @inproceedings{Stud22 file = {Stud22.pdf:pdf/Stud22.pdf:PDF}, abstract = {Introduction As pT1 colorectal cancers (CRC) tend to be overtreated, we investigate the previously proposed BTS (budding-T-cell-score = (#tumor-buds+1)/(#T-cells+1)) as a predictive marker to assess patients' need for resection. BTS was shown to be a better predictor of survival and other clinical factors than individual scoring. - + Materials and Methods We consider hotspots annotated by a pathologist according to the ITBCC guidelines on double-stained (AE1-AE3 pan-cytokeratin and CD8+) WSI from our pT1 CRC cohort (N=573). Within hotspots, tumor-buds and T-cells are automatically detected using convolutional neural networks and counted. The patients are divided into two groups based on their need for resection (no: N0 / follow-up without recurrence; yes: N1 / follow-up with recurrence). The dataset is imbalanced (89.2%/10.8%). To predict the patient group, we train a support-vector machine with data-balancing using the tumor-buds or T-cell counts individually, together, and just the BTS. We report the weighted accuracy, and sensitivity and specificity for the "yes" group. - + Results The highest weighted accuracy (62.8 +- 6.5%) and precision (17.6 +- 3.7%) are achieved using the tumor-buds count. Using the BTS achieves a sensitivity of 98.3 +- 2.9%, which outperforms the other models by more than 30%. 
- + Conclusion We show that combined assessment of tumor-buds and T-cells has the potential to serve as a predictive marker for the need of resection in pT1 cancers. However, there is still much room for improvement, as the low specificity still leads to overtreatment. We aim to address this in future work by also considering the spatial relationship of tumor-buds and T-cells and other predictive factors of nodal metastasis.}, } @@ -30126,7 +30126,7 @@ @article{Tan16 in ABUS is challenging since lesion edges might not be well defined. In this study, the authors aim at developing an automated segmentation method for malignant lesions in ABUS that is robust to ill-defined cancer edges and posterior shadowing. - + Methods: A segmentation method using depth-guided dynamic programming based on spiral scanning is proposed. The method automatically adjusts aggressiveness of the segmentation according to the position of the voxels relative to the lesion center. Segmentation is more aggressive in the @@ -30135,12 +30135,12 @@ @article{Tan16 for evaluation. The proposed method is compared to existing state of the art approaches such as graph cut, level set, and smart opening and an existing dynamic programming method without depth dependence. - + Results: In a dataset of 78 cancers, our proposed segmentation method achieved a mean Dice of 0.73+-0.14. The method outperforms an existing dynamic programming method (0.70+-0.16) on this task (p = 0.03) and it is also significantly (p < 0.001) better than graph cut (0.66+-0.18), level set based approach (0.63+-0.20) and smart opening (0.65+-0.12). 
- + Conclusions: The proposed depth-guided dynamic programming method achieves accurate breast malignant lesion segmentation results in automated breast ultrasound.}, file = {Tan16.pdf:pdf\\Tan16.pdf:PDF}, @@ -30334,16 +30334,16 @@ @article{Terh21 abstract = {Abstract Background Recruiting asymptomatic participants with early disease stages into studies is challenging and only little is known about facilitators and barriers to screening and recruitment of study participants. Thus we assessed factors associated with screening rates in the MACUSTAR study, a multi-centre, low-interventional cohort study of early stages of age-related macular degeneration (AMD). - + Methods Screening rates per clinical site and per week were compiled and applicable recruitment factors were assigned to respective time periods. A generalized linear mixed-effects model including the most relevant recruitment factors identified via in-depth interviews with study personnel was fitted to the screening data. Only participants with intermediate AMD were considered. - + Results A total of 766 individual screenings within 87 weeks were available for analysis. The mean screening rate was 0.6 +- 0.9 screenings per week among all sites. The participation at investigator teleconferences (relative risk increase 1.466, 95% CI [1.018-2.112]), public holidays (relative risk decrease 0.466, 95% CI [0.367-0.591]) and reaching 80% of the site's recruitment target (relative risk decrease 0.699, 95% CI [0.367-0.591]) were associated with the number of screenings at an individual site level. - + Conclusions Careful planning of screening activities is necessary when recruiting early disease stages in multi-centre observational or low-interventional studies. Conducting teleconferences with local investigators can increase screening rates. When planning recruitment, seasonal and saturation effects at clinical site level need to be taken into account. - + Trial registration ClinicalTrials.govNCT03349801. 
Registered on 22 November 2017. }, @@ -30441,7 +30441,7 @@ @conference{Teuw17b abstract = {PURPOSE In this study we evaluated the potential of a computer system to select exams with low likelihood of containing cancer. - + METHOD AND MATERIALS We collected a representative set of 1649 referrals with different screening outcome from the Dutch breast cancer screening. The dataset comprised 489 true positives (TP) exams and 1160 false @@ -30452,18 +30452,18 @@ @conference{Teuw17b computerized score represents the likelihood that a cancer is present in the exam at hand, where 10 represents the highest likelihood that a cancer is present. It is defined in such a way that, in a screening setting, the number of mammograms in each category is roughly equal. - + In this study, we determined the distribution of the computerized cancer likelihood scores for the TP, FP and TN exams. In particular we quantified for each category the fraction of cases with a cancer likelihood score below or equal to 5, including about 50% of the mammograms. Additionally we evaluated the positive predictive value (PPV) of referrals in each likelihood category. - + RESULTS 5.11% of the TPs, 20.3% of the FPs and 45.0% of the TNs were assigned to the likelihood categories 1 to 5. This corresponds to 0.7 cancers per 1000 in the group with score 1-5 and 11.2 per 1000 with a score higher than 5, based on the cancer detection rate of 6.5/1000 in the Dutch screening program. The PPV was 8.00%, 8.14%, and 44.9% for cancer likelihood scores 1, 5 and 10, respectively. - + CONCLUSION Automated identification of a fraction of screening mammograms that most likely are normal is feasible.}, @@ -30754,13 +30754,13 @@ @article{Tura21 abstract = {Abstract Objectives Over 2500 percutaneous transhepatic cholangiography and biliary drainage (PTCD) procedures are yearly performed in the Netherlands. 
Most interventions are performed for treatment of biliary obstruction following unsuccessful endoscopic biliary cannulation. Our aim was to evaluate complication rates and risk factors for complications in PTCD patients after failed ERCP. - + Methods We performed an observational study collecting data from a cohort that was subjected to PTCD during a 5-year period in one academic and four teaching hospitals. Primary objective was the development of infectious (sepsis, cholangitis, abscess, or cholecystitis) and non-infectious complications (bile leakage, severe hemorrhage, etc.) and mortality within 30 days of the procedure. Subsequently, risk factors for complications and mortality were analyzed with a multilevel logistic regression analysis. - + Results A total of 331 patients underwent PTCD of whom 205 (61.9%) developed PTCD-related complications. Of the 224 patients without a pre-existent infection, 91 (40.6%) developed infectious complications, i.e., cholangitis in 26.3%, sepsis in 24.6%, abscess formation in 2.7%, and cholecystitis in 1.3%. Non-infectious complications developed in 114 of 331 patients (34.4%). 30-day mortality was 17.2% (N = 57). Risk factors for infectious complications included internal drainage and drain obstruction, while multiple re-interventions were a risk factor for non-infectious complications. - + Conclusion Both infectious and non-infectious complications are frequent after PTCD, most often due to biliary drain obstruction. }, @@ -30981,13 +30981,13 @@ @conference{Valk19 title = {Familial discordance in disease phenotype in siblings with Stargardt disease}, abstract = {Purpose: To investigate intersibling discordance of the Stargardt disease (STGD1) phenotype. - + Methods: We performed a retrospective cohort study among siblings with genetically confirmed STGD1 and at least one available fundus autofluorescence (FAF) image of both eyes. 
We compared age of onset within families using the youngest patient as the reference and a predetermined threshold value of 10 years for significant differences. Disease duration was matched to investigate differences in best-corrected visual acuity, and we determined and compared the survival time for reaching severe visual impairment (SVI); (<20/200 Snellen or > 1.3 Logarithm of the Minimal Angle of Resolution (LogMAR)). Central retinal atrophy surface area was quantified and compared by two independent graders using the semi-automated EyeNED software. Additionally, both graders performed qualitative assessment of FAF patterns to identify phenotypic differences and commonalities. Main outcome measures included differences in age of onset, best-corrected visual acuity (BCVA), time to develop legal blindness, FAF atrophy surface area and autofluorescence patterns. - + Results: Significant differences in age of onset were present in 5/17 families, ranging from 13 to 39 years. BCVA was matched in 12/17 families and the median difference was 0.41 (0 - 1.10) LogMAR for the right and 0.41 (0 - 1.08) LogMAR for the left eye, and we found extreme differences in five families ranging from 0.58 to 1.1 LogMAR. The median age at which patients developed SVI was 14 years. We observed significant differences in time to develop SVI in three out of 12 families with matched survival times, ranging from 14 to 29 years. Median central retinal atrophy surface area was 11.38 mm2 in the right (range 1.98 - 44.78 mm2) and 10.59 mm2 in the left (range 1.61 - 40.59 mm2) eyes and was highly comparable between siblings, with the exception of family one. Qualitative FAF phenotypes were comparable in all sibling pairs. - + Conclusions: Phenotypic discordance between siblings with STGD1 disease carrying the same ABCA4 variants is a prevalent phenomenon. Functional outcomes can differ substantially despite highly comparable FAF phenotypes, which complicates sibling-based prognosis. 
While environmental factors are likely to modify the disease course, the relatively young median age at which patients develop SVI indicates an important role for genetic factors as disease modifiers.}, optnote = {DIAG, RADIOLOGY}, @@ -31007,22 +31007,22 @@ @article{Valk19a url = {https://www.sciencedirect.com/science/article/pii/S0161642019306578?via%3Dihub}, abstract = {Purpose To investigate intersibling phenotypic concordance in Stargardt disease (STGD1). - + Design Retrospective cohort study. - + Participants Siblings with genetically confirmed STGD1 and at least 1 available fundus autofluorescence (FAF) image of both eyes. - + Methods We compared age at onset within families. Disease duration was matched to investigate differences in best-corrected visual acuity (BCVA) and compared the survival time for reaching severe visual impairment (<20/200 Snellen or >1.0 logarithm of the minimum angle of resolution [logMAR]). Central retinal atrophy area was quantified independently by 2 experienced graders using semiautomated software and compared between siblings. Both graders performed qualitative assessment of FAF and spectral-domain (SD) OCT images to identify phenotypic differences. - + Main Outcome Measures Differences in age at onset, disease duration-matched BCVA, time to severe visual impairment development, FAF atrophy area, FAF patterns, and genotypes. - + Results Substantial differences in age at onset were present in 5 of 17 families, ranging from 13 to 39 years. Median BCVA at baseline was 0.60 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/80 [range, 20/12-hand movements]) in the right eye and 0.50 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/63 [range, 20/12-hand movements]) in the left eye. Disease duration-matched BCVA was investigated in 12 of 17 families, and the median difference was 0.41 logMAR (range, 0.00-1.10 logMAR) for the right eye and 0.41 logMAR (range, 0.00-1.08 logMAR) for the left eye. 
We observed notable differences in time to severe visual impairment development in 7 families, ranging from 1 to 29 years. Median central retinal atrophy area was 11.38 mm2 in the right eye (range, 1.98-44.78 mm2) and 10.59 mm2 in the left eye (range, 1.61-40.59 mm2) and highly comparable between siblings. Similarly, qualitative FAF and SD OCT phenotypes were highly comparable between siblings. - + Conclusions Phenotypic discordance between siblings with STGD1 carrying the same ABCA4 variants is a prevalent phenomenon. Although the FAF phenotypes are highly comparable between siblings, functional outcomes differ substantially. This complicates both sibling-based prognosis and genotype-phenotype correlations and has important implications for patient care and management.}, file = {Valk19a.pdf:pdf\\Valk19a.pdf:PDF}, @@ -31592,13 +31592,13 @@ @article{Ven16a url = {http://dx.doi.org/10.1016/j.clinimag.2016.02.005}, abstract = {Objectives To determine TRUS visibility of MR lesions. - + Methods Data from 34 patients with 56 MR lesions and prostatectomy was used. Five observers localized and determined TRUS visibility during retrospective fusion. Visibility was correlated to PIRADS and Gleason scores. - + Results TRUS visibility occurred in 43% of all MR lesions and 62% of PIRADS 5 lesions. Visible lesions had a significantly lower localization variability. On prostatectomy, 58% of the TRUS visible lesions had a Gleason 4 or 5 component. - + Conclusions Almost half of the MR lesions were visible on TRUS. TRUS visible lesions were more aggressive than TRUS invisible lesions.}, file = {Ven16a.pdf:pdf\\Ven16a.pdf:PDF}, @@ -31760,15 +31760,15 @@ @conference{Venh17 The obtained results were compared to manual annotations made by two experienced human graders in consensus for the central 3 mm surrounding the fovea. 
Hyperreflective foci were only annotated in the layers ranging from the inner plexiform layer ({IPL}) to the outer nuclear layer ({ONL}) as manual detection is challenging in the other layers. When a detection is overlapping with an annotated focus it is considered a true positive, otherwise it is counted as a false positive. - + Results: - + In the independent test set a sensitivity of 0.83 was obtained. At this level of sensitivity, an average of 8.3 false positives per {B}-scan were detected. False positives were mainly caused by detections outside the selected range ({IPL} to {ONL}) and misdetections by the graders. - + Conclusions: - + An image analysis algorithm for the automatic detection and quantification of hyperreflective foci in {OCT} {B}-scans was developed. The experiments show promising results to obtain quantitative foci based biomarkers that can be used for the prediction of treatment response in {DME}.}, optnote = {DIAG, RADIOLOGY}, @@ -31929,17 +31929,17 @@ @conference{Venk22 booktitle = ECR, title = {Deep learning for estimating pulmonary nodule malignancy risk using prior CT examinations in lung cancer screening}, abstract = {Purpose or Learning Objective: Nodule size, morphology, and growth are important factors for accurately estimating nodule malignancy risk in lung cancer screening CT examinations. In this work, we aimed to develop a deep learning (DL) algorithm that uses a current and a prior CT examination to estimate the malignancy risk of pulmonary nodules. 
We compared the performance of the algorithm against PanCan model 2b and a previously published single time-point DL algorithm that only processed a single CT examination. We used the area under the receiver operating characteristic curve (AUC) to measure discrimination performance and a standard permutation test with 10,000 random permutations to compute p-values. - + Results or Findings: The dual time-point DL algorithm achieved an AUC of 0.94 (95% CI: 0.91 - 0.97) on the hold-out test cohort. The algorithm outperformed the single time-point DL algorithm and the PanCan model, which had AUCs of 0.92 (95% CI: 0.89 - 0.95; p = 0.055) and 0.88 (95% CI: 0.85 - 0.91; p < 0.001), respectively. - + Conclusion: Deep learning algorithms using current and prior CT examinations have the potential to accurately estimate the malignancy risk of pulmonary nodules. - + Limitations: External validation is needed on other screening datasets to generate further evidence. - + Ethics committee approval: Institutional review board approval was obtained at each of the 33 centers involved in the NLST. - + Funding for this study: Research grant from MeVis Medical Solutions AG.}, optnote = {DIAG, RADIOLOGY}, year = {2022}, @@ -31955,16 +31955,16 @@ @article{Venk23 algorithm = {https://grand-challenge.org/algorithms/temporal-nodule-analysis/}, abstract = {Background Prior chest CT provides valuable temporal information (eg, changes in nodule size or appearance) to accurately estimate malignancy risk. - + Purpose To develop a deep learning (DL) algorithm that uses a current and prior low-dose CT examination to estimate 3-year malignancy risk of pulmonary nodules. 
- + Materials and Methods In this retrospective study, the algorithm was trained using National Lung Screening Trial data (collected from 2002 to 2004), wherein patients were imaged at most 2 years apart, and evaluated with two external test sets from the Danish Lung Cancer Screening Trial (DLCST) and the Multicentric Italian Lung Detection Trial (MILD), collected in 2004-2010 and 2005-2014, respectively. Performance was evaluated using area under the receiver operating characteristic curve (AUC) on cancer-enriched subsets with size-matched benign nodules imaged 1 and 2 years apart from DLCST and MILD, respectively. The algorithm was compared with a validated DL algorithm that only processed a single CT examination and the Pan-Canadian Early Lung Cancer Detection Study (PanCan) model. - + Results The training set included 10 508 nodules (422 malignant) in 4902 trial participants (mean age, 64 years +- 5 [SD]; 2778 men). The size-matched external test sets included 129 nodules (43 malignant) and 126 nodules (42 malignant). The algorithm achieved AUCs of 0.91 (95% CI: 0.85, 0.97) and 0.94 (95% CI: 0.89, 0.98). It significantly outperformed the DL algorithm that only processed a single CT examination (AUC, 0.85 [95% CI: 0.78, 0.92; P = .002]; and AUC, 0.89 [95% CI: 0.84, 0.95; P = .01]) and the PanCan model (AUC, 0.64 [95% CI: 0.53, 0.74; P < .001]; and AUC, 0.63 [95% CI: 0.52, 0.74; P < .001]). - + Conclusion A DL algorithm using current and prior low-dose CT examinations was more effective at estimating 3-year malignancy risk of pulmonary nodules than established models that only use a single CT examination.}, citation-count = {0}, @@ -31984,12 +31984,12 @@ @conference{Vent20 booktitle = ARVO, title = {Estimating Uncertainty of Deep Neural Networks for Age-related Macular Degeneration Grading using Optical Coherence Tomography}, abstract = {Purpose: Deep convolutional neural networks (CNNs) are increasingly being used for eye disease screening and diagnosis. 
Especially the best performing variants, however, are generally overconfident in their predictions. For usefulness in clinical practice and increasing clinicians' trust on the estimated diagnosis, well-calibrated uncertainty estimates are necessary. We present a method for providing confidence scores of CNNs for age-related macular degeneration (AMD) grading in optical coherence tomography (OCT). - - + + Methods: 1,264 OCT volumes from 633 patients from the European Genetic Database (EUGENDA) were graded as one of five stages of AMD (No AMD, Early AMD, Intermediate AMD, Advanced AMD: GA, and Advanced AMD: CNV). Ten different 3D DenseNet-121 models that take a full OCT volume as input were used to predict the corresponding AMD stage. These networks were all trained on the same dataset. However, each of these networks were initialized differently. The class with the maximum average softmax output of these models was used as the final prediction. The confidence measure was the normalized average softmax output for that class. - + Results: The algorithm achieved an area under the Receiver Operating Characteristic of 0.9785 and a quadratic-weighted kappa score of 0.8935. The mean uncertainty, calculated as 1 - the mean confidence score, for incorrect predictions was 1.9 times as high as the mean uncertainty for correct predictions. When only using the probability output of a single network, this ratio was 1.4. Another measure for uncertainty estimation performance is the Expected Calibration Error (ECE), where a lower value is better. When comparing the method to the probability output of a single network, the ECE improved from 0.0971 to 0.0324. Figure 1 shows examples of both confident and unconfident predictions. - + Conclusions: We present a method for improving uncertainty estimation for AMD grading in OCT, by combining the output of multiple individually trained CNNs. 
This increased reliability of system confidences can contribute to building trust in CNNs for retinal disease screening. Furthermore, this technique is a first step towards selective prediction in retinal disease screening, where only cases with high uncertainty predictions need to be referred for expert evaluation.}, optnote = {DIAG, RADIOLOGY}, year = {2020}, @@ -32022,11 +32022,11 @@ @conference{Vent21a url = {https://iovs.arvojournals.org/article.aspx?articleid=2775505}, title = {Making AI Transferable Across OCT Scanners from Different Vendors}, abstract = {Purpose: Deep neural networks (DNNs) for optical coherence tomography (OCT) classification have been proven to work well on images from scanners that were used during training. However, since the appearance of OCT scans can differ greatly between vendors, these DNNs often fail when they are applied to scans from different manufacturers. We propose a DNN architecture for age-related macular degeneration (AMD) grading that maintains performance on OCTs from vendors not included during training. - + Methods: 2,598 and 680 Heidelberg Spectralis OCT scans from the European Genetic Database were used for development and testing, respectively. We tested transferability with 339 AMD-enriched Topcon OCTs from the Rotterdam Study. AMD severity classification was determined manually in accordance with the Cologne Image Reading Center and Laboratory and Rotterdam Classification, respectively. Classifications were harmonized for the evaluation of the DNNs. The proposed DNN considers each B-scan separately using a 2D ResNet-18, and internally combines the intermediate outputs related to each B-scan using a multiple instance learning approach. Even though the proposed DNN provides both B-scan level and OCT-volume level decisions, the architecture is trained end-to-end using only full volume gradings. 
This specific architecture makes our method robust to the variability of scanning protocols across vendors, as it is invariant to B-scan spacing. We compare this approach to a baseline that classifies the full OCT scan directly using a 3D ResNet-18. - + Results: The quadratic weighted kappa (QWK) for the baseline method dropped from 0.852 on the Heidelberg Spectralis dataset to 0.523 on the Topcon dataset. This QWK drop was smaller (p = 0.001) for our approach, which dropped from 0.849 to 0.717. The difference in area under the Receiver Operating Characteristic (AUC) drop was also smaller (p < 0.001) for our approach (0.969 to 0.906, -6.5%) than for the baseline method (0.971 to 0.806, -17.0%). - + Conclusions: We present a DNN for AMD classification on OCT scans that transfers well to scans from vendors that were not used for development. This alleviates the need for retraining on data from these scanner types, which is an expensive process in terms of data acquisition, model development, and human annotation time. Furthermore, this increases the applicability of AI for OCT classification in broader scopes than the settings in which they were developed.}, optnote = {DIAG, RADIOLOGY}, year = {2021}, @@ -32197,16 +32197,16 @@ @article{Vina23 abstract = {Abstract Objective Intra-oral scans and gypsum cast scans (OS) are widely used in orthodontics, prosthetics, implantology, and orthognathic surgery to plan patient-specific treatments, which require teeth segmentations with high accuracy and resolution. Manual teeth segmentation, the gold standard up until now, is time-consuming, tedious, and observer-dependent. This study aims to develop an automated teeth segmentation and labeling system using deep learning. - + Material and methods As a reference, 1750 OS were manually segmented and labeled. 
A deep-learning approach based on PointCNN and 3D U-net in combination with a rule-based heuristic algorithm and a combinatorial search algorithm was trained and validated on 1400 OS. Subsequently, the trained algorithm was applied to a test set consisting of 350 OS. The intersection over union (IoU), as a measure of accuracy, was calculated to quantify the degree of similarity between the annotated ground truth and the model predictions. - + Results The model achieved accurate teeth segmentations with a mean IoU score of 0.915. The FDI labels of the teeth were predicted with a mean accuracy of 0.894. The optical inspection showed excellent position agreements between the automatically and manually segmented teeth components. Minor flaws were mostly seen at the edges. - + Conclusion The proposed method forms a promising foundation for time-effective and observer-independent teeth segmentation and labeling on intra-oral scans. - + Clinical significance Deep learning may assist clinicians in virtual treatment planning in orthodontics, prosthetics, implantology, and orthognathic surgery. The impact of using such models in clinical practice should be explored.}, citation-count = {0}, @@ -32553,16 +32553,16 @@ @conference{Vree15a year = {2015}, abstract = {PURPOSE The purpose of this study was to evaluate the visibility of MR screen detected cancers on prior MR examinations in a population with an elevated risk for breast cancer. - + METHOD AND MATERIALS An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was conducted at our academic center. We selected all cases detected in MRI with a prior negative MR examination performed between 6 and 24 months before a cancer was revealed (mean: 12.8 +- 3.7 months). This yielded 43 cancers (3 invasive lobular-, 33 invasive ductal carcinomas, 5 ductal carcinoma in situ and 2 others) in 41 patients (age: 49 +- 9.8 years, 21 BRCA patients). 
The MR scans where the cancers were detected (diagnostic MR scan) and the prior MR scans were evaluated side-by-side in consensus by two dedicated breast radiologists. The visibility of the cancers on prior scans was rated as: visible (BIRADS 4/5), minimal sign (BIRADS 2/3), or invisible (BIRADS 1). Chi-square tests were used to test the correlation between patient and cancer characteristics, image quality (IQ), background parenchymal enhancement (BPE), and visibility of the tumor in the prior MR scan. - + RESULTS All lesions were retrospectively evident on the diagnostic MR scan. Review of the prior examinations of the 43 cancers detected in follow-up rounds revealed that 11 lesions (26%) were visible in the prior MRI and should have been recalled at the time of this scan. 15 lesions (35%) showed a minimal sign in the prior MRI. Only 17 lesions (40%) were completely invisible. High grade, ER negative, and PR negative tumors were more often invisible in the prior scan (p=0.016, p=0.005, and p=0.002). Moreover, tumors in BRCA patients were more likely to be invisible in the prior scan, than in non-BRCA carriers (p=0.025). IQ and BPE were not significantly related to the visibility of tumors in the prior scan. - + CONCLUSION About 26% of the breast cancers could have been recalled earlier and only 40% of the breast cancers were invisible in retrospect. - + CLINICAL RELEVANCE/APPLICATION To prevent screening errors regular auditing of clinical practice is indicated. Moreover, like in mammography, structural double reading of MRI screening examinations may be recommended.}, optnote = {DIAG, RADIOLOGY}, @@ -32575,16 +32575,16 @@ @conference{Vree15b year = {2015}, abstract = {PURPOSE Breast cancer screening in women at elevated risk is performed with yearly MRI and mammography. This includes women with BRCA mutations and women at elevated risk for other causes (mainly family history). 
The purpose of this study was to assess differences between BRCA mutation carriers and non-BRCA patients in a longitudinal MRI screening program in terms of recall rate, positive predictive value, and detection. - + METHOD AND MATERIALS An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was performed at our academic center. We analysed 9,504 screening MR examinations in 2843 women (age: 45 +- 12.09 years), including 761 BRCA patients, and 2082 non-BRCA patients. Recall rate (RR), positive predictive value (PPV), and cancer detection rate (CDR) were evaluated for first round examinations and follow-up examinations separately. BRCA patients were compared with non-BRCA patients. Chi-square tests were used to determine statistical significance. - + RESULTS The RR for BRCA patients in the first round of screening was 86.07 per 1000 examinations and 52.58 per 1000 examinations in non-BRCA patients (p<0.001). The PPV for BRCA patients in the first round of screening was found to be 0.44, compared to 0.50 in non-BRCA patients (p=0.013). The CDR was 38.25 per 1000 examinations for BRCA patients and 26.53 per 1000 examinations for non-BRCA patients (p<0.001). In follow up, the RR was found to be 24.92 per 1000 examinations for BRCA patients and 22.81 per 1000 examinations for non-BRCA patients (p<0.001). The PPV was 0.46 for BRCA patients and 0.21 for non-BRCA patients (p<0.001). CDR was 11.42 per 1000 examinations for BRCA patients and 4.86 per 1000 examinations for non-BRCA patients (p<0.001). - + CONCLUSION RR and CDR are high for all patients in the first round. RR and CDR significantly decreased in follow-up rounds (p<0.001). PPV remained at an acceptable level for both patient groups, and remains particularly high in BRCA carriers. RR, PPV, and CDR differed significantly between BRCA and non-BRCA patients in both first and follow up rounds. 
- + CLINICAL RELEVANCE/APPLICATION These results underline that MRI is an excellent tool for screening high risk patients. Cancer detection is very high in the first round in all patients, but remains high only in BRCA carriers in follow up rounds.}, optnote = {DIAG, RADIOLOGY}, @@ -32613,7 +32613,7 @@ @conference{Vree16 4.A.J. Maxwell et al. A study of breast cancers detected in the incident round of the UK NHS Breast Screening Programme: the importance of early detection and treatment of ductal carcinoma in situ. Breast (2001), 10(5):392-8 5.A. Gubern-Merida et al. Automated localization of breast cancer in DCE-MRI. Med Imag Anal (2015),20(1):265-74 6.BI-RADS Atlas, 5th ed 2013 - + Acknowledgements: European Unions 7FP (Grant 601040)}, optnote = {DIAG, RADIOLOGY}, } @@ -33319,16 +33319,16 @@ @article{Wink21a abstract = {Abstract Objectives Digital breast tomosynthesis (DBT) increases sensitivity of mammography and is increasingly implemented in breast cancer screening. However, the large volume of images increases the risk of reading errors and reading time. This study aims to investigate whether the accuracy of breast radiologists reading wide-angle DBT increases with the aid of an artificial intelligence (AI) support system. Also, the impact on reading time was assessed and the stand-alone performance of the AI system in the detection of malignancies was compared to the average radiologist. - + Methods A multi-reader multi-case study was performed with 240 bilateral DBT exams (71 breasts with cancer lesions, 70 breasts with benign findings, 339 normal breasts). Exams were interpreted by 18 radiologists, with and without AI support, providing cancer suspicion scores per breast. Using AI support, radiologists were shown examination-based and region-based cancer likelihood scores. Area under the receiver operating characteristic curve (AUC) and reading time per exam were compared between reading conditions using mixed-models analysis of variance. 
- + Results On average, the AUC was higher using AI support (0.863 vs 0.833; p = 0.0025). Using AI support, reading time per DBT exam was reduced (p < 0.001) from 41 s (95% CI = 39-42 s) to 36 s (95% CI = 35-37 s). The AUC of the stand-alone AI system was non-inferior to the AUC of the average radiologist (+0.007, p = 0.8115). - + Conclusions Radiologists improved their cancer detection and reduced reading time when evaluating DBT examinations using an AI reading support system. - + Key Points * Radiologists improved their cancer detection accuracy in digital breast tomosynthesis (DBT) when using an AI system for support, while simultaneously reducing reading time. * The stand-alone breast cancer detection performance of an AI system is non-inferior to the average performance of radiologists for reading digital breast tomosynthesis exams. @@ -33757,15 +33757,15 @@ @conference{Zeel19 title = {{EyeNED} workstation: Development of a multi-modal vendor-independent application for annotation, spatial alignment and analysis of retinal images}, abstract = {Purpose: Researchers and specialists in the field of ophthalmology currently rely on suboptimal vendor-specific software solutions for viewing and annotating retinal images. Our goal was to develop a fully-featured vendor-independent application that allows researchers and specialists to visualize multi-modal retinal images, perform spatial alignment and annotations, and review outputs of artificial intelligence (AI) algorithms. - + Methods: The application consists of a web-based front-end that allows users to analyze baseline and follow-up images in a multi-modal viewer. It communicates with a back-end interface for grader authentication, loading and storing of images and annotation data. Several types of annotation techniques are available, ranging from image-level classification to point-based and region-based lesion-level annotations. 
- + The user can select color fundus (CF) images, optical coherence tomography (OCT) volumes, infrared (IR) and autofluorescence (AF) images to be shown simultaneously in the viewer. Spatial alignment of the different modalities can be performed using an integrated affine registration method by clicking on corresponding landmarks, after which a synchronized cursor will appear. After several graders have annotated lesions, the application can be used to compare these and create a consensus grading. - + Results: The application was used by graders and researchers in the EyeNED research group. Region-based annotations of geographic atrophy were made for 313 studies containing 488 CF images and 68 OCT images; and of drusen in 100 OCT b-scans. Semi-automatic annotation of the area of central retinal atrophy in Stargardt disease was performed for 67 AF images. Point-based annotation was carried out on lesions in 50 CF images of diabetic retinopathy patients. The multimodal viewing and localisation of lesions was perceived as particularly helpful in the grading of lesions and consensus discussions. - + Conclusions: A software solution has been developed to assist researchers and specialists to view and annotate retinal images. The application was successfully used for annotating lesions in various imaging modalities, facilitating the grading of images in large studies and the collection of annotations for AI solutions.}, optnote = {DIAG, RADIOLOGY}, @@ -33805,13 +33805,13 @@ @article{Zels15 doi = {10.1016/j.acra.2015.08.006}, abstract = {RATIONALE AND OBJECTIVES: To investigate the value of multiplanar reconstructions (MPRs) of automated three-dimensional (3D) breast ultrasound (ABUS) compared to transverse evaluation only, in differentiation of benign and malignant breast lesions. - + MATERIALS AND METHODS: Five breast radiologists evaluated ABUS scans of 96 female patients with biopsy-proven abnormalities (36 malignant and 60 benign). 
They classified the most suspicious lesion based on the breast imaging reporting and data system (BI-RADS) lexicon using the transverse scans only. A likelihood-of-malignancy (LOM) score (0-100) and a BI-RADS final assessment were assigned. Thereafter, the MPR was provided and readers scored the cases again. In addition, they rated the presence of spiculation and retraction in the coronal plane on a five-point scale called Spiculation and Retraction Severity Index (SRSI). Reader performance was analyzed with receiver-operating characteristics analysis. - + RESULTS: The area under the curve increased from 0.82 to 0.87 (P = .01) after readers were shown the reconstructed planes. The SRSI scores are highly correlated (Spearman's r) with the final LOM scores (range, r = 0.808-0.872) and DLOM scores (range, r = 0.525-0.836). Readers downgraded 3%-18% of the biopsied benign lesions to BI-RADS 2 after MPR evaluation. Inter-reader agreement for SRSI was substantial (intraclass correlation coefficient, 0.617). Inter-reader agreement of the BI-RADS final assessment improved from 0.367 to 0.536 after MPRs were read. - + CONCLUSIONS: Full 3D evaluation of ABUS using MPR improves differentiation of breast lesions in comparison to evaluating only transverse planes. Results suggest that the added value of MPR might be related to visualization of spiculation and retraction patterns in the coronal reconstructions.}, file = {Zels15.pdf:pdf\\Zels15.pdf:PDF},