-
Notifications
You must be signed in to change notification settings - Fork 856
/
Copy pathnlp_spark.py
105 lines (91 loc) · 3.04 KB
/
nlp_spark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.preprocess.spark import make_spark_preprocessor
from .nlp import (
BaseNLPLabelingFunction,
SpacyPreprocessorParameters,
base_nlp_labeling_function,
)
class SparkNLPLabelingFunction(BaseNLPLabelingFunction):
    r"""Spark-compatible labeling function type for SpaCy-based LFs.

    Behaves exactly like ``NLPLabelingFunction`` — see that class for the
    full contract — except that the SpaCy preprocessor it builds is made
    runnable on Spark executors.

    Parameters
    ----------
    name
        Name of the LF
    f
        Function that implements the core LF logic
    resources
        Labeling resources passed in to ``f`` via ``kwargs``
    pre
        Preprocessors to run before SpacyPreprocessor is executed
    text_field
        Name of data point text field to input
    doc_field
        Name of data point field to output parsed document to
    language
        SpaCy model to load
        See https://spacy.io/usage/models#usage
    disable
        List of pipeline components to disable
        See https://spacy.io/usage/processing-pipelines#disabling
    memoize
        Memoize preprocessor outputs?
    memoize_key
        Hashing function to handle the memoization (default to snorkel.map.core.get_hashable)
    gpu
        Prefer Spacy GPU processing?

    Raises
    ------
    ValueError
        Calling incorrectly defined preprocessors

    Attributes
    ----------
    name
        See above
    """

    @classmethod
    def _create_preprocessor(
        cls, parameters: SpacyPreprocessorParameters
    ) -> SpacyPreprocessor:
        # Construct the SpaCy preprocessor from the parameter bundle, then
        # patch it in place so it can execute on Spark workers.
        spacy_pre = SpacyPreprocessor(**parameters._asdict())
        make_spark_preprocessor(spacy_pre)
        return spacy_pre
class spark_nlp_labeling_function(base_nlp_labeling_function):
    """Decorator to define a SparkNLPLabelingFunction object from a function.

    Parameters
    ----------
    name
        Name of the LF
    resources
        Labeling resources passed in to ``f`` via ``kwargs``
    pre
        Preprocessors to run before SpacyPreprocessor is executed
    text_field
        Name of data point text field to input
    doc_field
        Name of data point field to output parsed document to
    language
        SpaCy model to load
        See https://spacy.io/usage/models#usage
    disable
        List of pipeline components to disable
        See https://spacy.io/usage/processing-pipelines#disabling
    memoize
        Memoize preprocessor outputs?
    memoize_key
        Hashing function to handle the memoization (default to snorkel.map.core.get_hashable)

    Example
    -------
    >>> @spark_nlp_labeling_function()
    ... def has_person_mention(x):
    ...     person_ents = [ent for ent in x.doc.ents if ent.label_ == "PERSON"]
    ...     return 0 if len(person_ents) > 0 else -1
    >>> has_person_mention
    SparkNLPLabelingFunction has_person_mention, Preprocessors: [SpacyPreprocessor...]

    >>> from pyspark.sql import Row
    >>> x = Row(text="The movie was good.")
    >>> has_person_mention(x)
    -1
    """

    # The base decorator instantiates this class; pointing it at the
    # Spark-aware LF type is the only specialization needed here.
    _lf_cls = SparkNLPLabelingFunction