CentreForDigitalHumanities · lukavdplas · Nov 20, 2024 · Oct 22, 2024 · Nov 14, 2024 · BeritJanssen
diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
@@ -10,6 +10,7 @@
 import logging
 import traceback
 from typing import Any, Dict, Callable, Union, List, Optional, Iterable
+import warnings
 
 import bs4
 import html
@@ -28,25 +29,37 @@ class Extractor(object):
     An extractor contains a method that can be used to gather data for a field. 
 
     Parameters:
-        applicable: optional predicate that takes metadata and decides whether this
-            extractor is applicable. If left as `None`, the extractor is always
-            applicable.
-        transform: optional function to transform the postprocess the extracted value.
+        applicable: 
+            optional argument to check whether the extractor can be used. This should
+            be another extractor, which is applied first; the containing extractor
+            is only applied if the result is truthy. Any extractor can be used, as long as
+            it's supported by the Reader in which it's used. If left as `None`, this 
+            extractor is always applicable.
+        transform: optional function to transform or postprocess the extracted value.
     '''
 
     def __init__(self,
-                 applicable: Optional[Callable[[Dict], bool]] = None,
+                 applicable: Union['Extractor', Callable[[Dict], bool], None] = None,
                  transform: Optional[Callable] = None
                  ):
+
+        if callable(applicable):
+            warnings.warn(
+                'Using a callable as "applicable" argument is deprecated; provide an '
+                'Extractor instead',
+                DeprecationWarning,
+            )
+
         self.transform = transform
         self.applicable = applicable
 
+
     def apply(self, *nargs, **kwargs):
         '''
         Test if the extractor is applicable to the given arguments and if so,
         try to extract the information.
         '''
-        if self.applicable is None or self.applicable(kwargs.get('metadata')):
+        if self._is_applicable(*nargs, **kwargs):
             result = self._apply(*nargs, **kwargs)
             try:
                 if self.transform:
@@ -73,6 +86,31 @@ def _apply(self, *nargs, **kwargs):
         raise NotImplementedError()
 
 
+    def _is_applicable(self, *nargs, **kwargs) -> bool:
+        '''
+        Checks whether the extractor is applicable, based on the condition passed as the
+        `applicable` parameter.
+
+        If no condition is provided, this is always true. If the condition is an
+        Extractor, this checks whether the result is truthy.
+
+        If the condition is a callable, it will be called with the document metadata as
+        an argument. This option is deprecated; you can use the Metadata extractor to
+        replace it.
+
+        Raises:
+            TypeError: Raised if the applicable parameter is an unsupported type.
+        '''
+        if self.applicable is None:
+            return True
+        if isinstance(self.applicable, Extractor):
+            return bool(self.applicable.apply(*nargs, **kwargs))
+        if callable(self.applicable):
+            return self.applicable(kwargs.get('metadata'))
+        return TypeError(
+            f'Unsupported type for "applicable" parameter: {type(self.applicable)}'
+        )
+
 class Choice(Extractor):
     '''
     Use the first applicable extractor from a list of extractors.
@@ -89,8 +127,21 @@ class Choice(Extractor):
     This would extract `'foo'` if `some_condition` is met; otherwise,
     the extracted value will be `'bar'`.
 
-    Note the difference with `Backup`: `Choice` is based on _metadata_, rather than the
-    extracted value.
+    Note the difference with `Backup`: `Backup` will select the first truthy value from a
+    list of extractors, but `Choice` only checks the `applicable` condition. For example:
+
+        Choice(
+            CSV('foo', applicable=Metadata('bar')),
+            CSV('baz'),
+        )
+
+        Backup(
+            CSV('foo', applicable=Metadata('bar')),
+            CSV('baz'),
+        )
+
+    These extractors behave differently if the "bar" condition holds, but the "foo" field
+    is empty. `Backup` will try to extract the "baz" field, `Choice` will not.
 
     Parameters:
         *extractors: extractors to choose from. These should be listed in descending
@@ -102,10 +153,10 @@ def __init__(self, *extractors: Extractor, **kwargs):
         self.extractors = list(extractors)
         super().__init__(**kwargs)
 
-    def _apply(self, metadata: Dict, *nargs, **kwargs):
+    def _apply(self, *nargs, **kwargs):
         for extractor in self.extractors:
-            if extractor.applicable is None or extractor.applicable(metadata):
-                return extractor.apply(metadata=metadata, *nargs, **kwargs)
+            if extractor._is_applicable(*nargs, **kwargs):
+                return extractor.apply(*nargs, **kwargs)
         return None
 
 
@@ -149,8 +200,8 @@ class Backup(Extractor):
     Since the first extractor returns `None`, the second extractor will be used, and the 
     extracted value would be `'foo'`.
 
-    Note the difference with `Choice`: `Backup` is based on the _extracted value_, rather
-    than metadata.
+    Note the difference with `Choice`: `Backup` is based on the _extracted value_,
+    `Choice` on the `applicable` parameter of each extractor.
 
     Parameters:
         *extractors: extractors to use. These should be listed in descending order of

diff --git a/tests/test_generic_extractors.py b/tests/test_generic_extractors.py
@@ -1,3 +1,4 @@
+import pytest
 from ianalyzer_readers.extract import (
     Constant, Combined, Backup, Choice, Metadata, Pass, Order
 )
@@ -32,7 +33,7 @@ def test_choice_extractor():
     extractor = Choice(
         Constant(
             'first',
-            applicable=lambda metadata: metadata.get('check'),
+            applicable=Metadata('check'),
         ),
         Constant(
             'second'
@@ -70,4 +71,17 @@ def test_pass_extractor():
 def test_order_extractor():
     extractor = Order()
     output = extractor.apply(index=1)
-    assert output == 1
+    assert output == 1
+
+
+def test_extractor_applicable_extractor():
+    extractor = Constant('test', applicable=Metadata('testing'))
+    assert extractor.apply(metadata={'testing': True}) == 'test'
+    assert extractor.apply(metadata={'testing': False}) == None
+
+
+def test_extractor_applicable_callable():
+    with pytest.warns(DeprecationWarning):
+        extractor = Constant('test', applicable=lambda metadata: metadata['testing'])
+    assert extractor.apply(metadata={'testing': True}) == 'test'
+    assert extractor.apply(metadata={'testing': False}) == None