-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpatterns.py
119 lines (109 loc) · 3.24 KB
/
patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# ruff: noqa: E501
# Phones
# Regex source for French phone numbers: a national "0" prefix or an
# international "+33" / "0033" prefix (optionally followed by "(0)"), then nine
# digits grouped either as 1 + 4x2 or as 3 + 2x3.  Three alternatives are
# spelled out, one per separator style (dots, hyphens, spaces / nothing), so a
# number has to use a single separator style throughout.
#
# The leading look-behind is variable-width, which the standard-library ``re``
# module rejects; presumably this is compiled with the third-party ``regex``
# package -- confirm against the caller.
#
# NOTE(review): the leading ``\b`` can only succeed in front of "+" when a word
# character precedes it, so a "+33..." number at the start of a token is not
# matched while "0033..." is.  Left unchanged here; verify whether that is
# intended.
delimiters = ["", r"\.", r"\-", " "]
phone_pattern = (
    r"("
    # Not preceded by a digit within up to three separator characters.
    r"(?<!\d[ .-]{,3})\b"
    # Dot-separated: 01.23.45.67.89 / 0033.1.23.45.67.89
    r"((?:(?:\+|00)33\s?[.]\s?(?:\(0\)\s?[.]\s?)?|0)[1-9](?:(?:[.]\d{2}){4}|\d{2}(?:[.]\d{3}){2})(?![\d])"
    # Hyphen-separated: 01-23-45-67-89 / 0033-1-23-45-67-89
    r"|(?:(?:\+|00)33\s?[-]\s?(?:\(0\)\s?[-]\s?)?|0)[1-9](?:(?:[-]\d{2}){4}|\d{2}(?:[-]\d{3}){2})(?![\d])"
    # Space-separated or unseparated: 01 23 45 67 89 / 0033 1 23 45 67 89.
    # The hyphen after the country code is optional: when it was mandatory,
    # "0033 6 12 34 56 78" and "0033612345678" could not match any alternative.
    r"|(?:(?:\+|00)33\s?[-]?\s?(?:\(0\)\s)?|0)[1-9](?:(?:[ ]?\d{2}){4}|\d{2}(?:[ ]?\d{3}){2})(?![\d]))"
    # Not followed by a digit within up to three separator characters.
    r"\b(?![ .-]{,3}\d)"
    r")"
)
# IPP
# Regex source for a ten-digit identifier that starts with "8"; each of the
# nine following digits may be followed by a single space.  The surrounding
# look-arounds reject candidates that have another digit within up to three
# separator characters (space, dot, hyphen) on either side.
ipp_pattern = (
    r"("
    r"(?<!\d[ .-]{,3})\b"  # no digit shortly before
    r"(8(\d ?){9})"  # "8" followed by nine digits
    r"\b(?![ .-]{,3}\d)"  # no digit shortly after
    r")"
)
# NDA
# Verbose-mode regex source for an identifier that directly follows one of the
# keywords examen / demande / sejour / dossier.  The keyword is matched
# case-insensitively and may be introduced by "no", "n°" or "numero"
# (optionally followed by "de" / "d'"); it is then followed by optional
# whitespace and an optional colon.  All of that sits in a look-behind, so only
# the identifier itself is part of the match.
#
# The captured identifier contains a run of at least two digits, may contain
# up to two uppercase letters, and may end with a "-digits" suffix.  It must
# sit on word boundaries and must not be followed by one of - / + \ _.
#
# The look-behind is variable-width, which the standard-library ``re`` module
# rejects; presumably this is compiled with the third-party ``regex`` package
# -- confirm against the caller.
# NOTE(review): the capitalised Examen|Demande|Sejour alternative looks
# redundant with the case-insensitive group above it -- confirm before removing.
nda_pattern = r"""(?x)
(?<=(?:
(?:(?i:
(?:(?:no|n°|numero|no\s+d[e'‘]|n°\s+d[e'‘]|numero\s+d[e'‘])\s+)?
(?:examen|demande|sejour|dossier)
))
|(?:Examen|Demande|Sejour)
)\s*:?\s*)
\b
(
\d{2,}[A-Z]?[A-Z]?\d*(?:[-]\d+)?
|\d*[A-Z]?[A-Z]?\d{2,}(?:[-]\d+)?
)
\b
(?![-/+\\_])
"""
# Mail
# Regex source for e-mail addresses.  It resembles the widely circulated
# RFC 5322-style address expression, relaxed so that one optional space is
# tolerated around every dot and around the "@" sign.
# Only lowercase ASCII letters are listed in the character classes, so matching
# uppercase addresses presumably relies on the caller -- verify.
mail_pattern = (
    # Local part, unquoted form: dot-separated runs of atom characters.
    r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?: ?\. ?[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
    # Local part, quoted form: "..." with backslash escapes.
    r'|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")'
    # Separator.
    r" ?@ ?"
    # Domain, host-name form: dot-separated labels.
    r"(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])? ?\. ?)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
    # Domain, bracketed literal form: [a.b.c.d] or [a.b.c.tag:...].
    r"|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) ?\. ?){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"
)
# NSS
# Verbose-mode regex source for a 15-digit identifier whose fields are labelled
# by the comments inside the pattern: sex, year and month of birth, location of
# birth, birth number and control key.  A single optional space is tolerated
# between any two digits.  A second alternative accepts a "temporary" number
# made of a leading 3, 4, 7 or 8 followed by fourteen digits.
#
# Whitespace outside character classes is ignored in verbose mode, so every
# literal space has to be written as ``[ ]``.
#
# The leading look-behind is variable-width, which the standard-library ``re``
# module rejects; presumably this is compiled with the third-party ``regex``
# package -- confirm against the caller.
nss_pattern = r"""(?x)
# No digits just before on the same line
# (the hyphen is placed last in the class so that it is a literal character;
# written as ".-/" it formed a range that left the hyphen out)
(?<!\d[ ./+=_-]{,3})\b
(
    # Sex
    (?:[1-2])[ ]?
    # Year of birth
    (?:([0-9][ ]?){2})[ ]?
    # Month of birth
    (?:0[ ]?[0-9]|[2-35-9][ ]?[0-9]|[14][ ]?[0-2])[ ]?
    # Location of birth
    (?:
        # Two-character area code followed by a three-digit code
        (?:
            0[ ]?[1-9]
            |[1-8][ ]?[0-9]
            |9[ ]?[0-69]
            |2[ ]?[abAB]
        )[ ]?
        (?:
            0[ ]?0[ ]?[1-9]|0[ ]?[1-9][ ]?[0-9]|
            [1-8][ ]?([0-9][ ]?){2}|9[ ]?[0-8][ ]?[0-9]|9[ ]?9[ ]?0
        )
        # Three-digit area code (97x / 98x) followed by a two-digit code.
        # The last alternative uses an explicit optional space: a bare space
        # is dropped in verbose mode and would make the 9 itself optional.
        |(?:9[ ]?[78][ ]?[0-9])[ ]?(?:0[ ]?[1-9]|[1-8][ ]?[0-9]|9[ ]?0)
    )[ ]?
    # Birth number 001-999
    (?:0[ ]?0[ ]?[1-9]|0[ ]?[1-9][ ]?[0-9]|[1-9][ ]?([0-9][ ]?){2})[ ]?
    # Control key
    (?:0[ ]?[1-9]|[1-8][ ]?[0-9]|9[ ]?[0-7])
    |
    # Temporary NSS
    [3478][ ]?(?:[0-9][ ]?){14}
)
# Not followed by digits on the same line
\b(?![ .-]{,3}\d)
"""
# PERSON (FIRSTNAME AND LASTNAME)
# Building blocks interpolated into the patterns below.  ``\p{Ll}`` (any
# lowercase letter) is not understood by the standard-library ``re`` module;
# presumably these are compiled with the third-party ``regex`` package --
# confirm against the caller.
#
# Capitalised word: one ASCII uppercase letter then one or more lowercase letters.
Xxxxx = r"[A-Z]\p{Ll}+"
# An ASCII uppercase letter followed by ONE uppercase / lowercase / hyphen
# character; every use below appends "+" to allow more of them.
XXxX_ = r"[A-Z][A-Z\p{Ll}-]"
# Optional separator between name parts: spaces (possibly none) or a hyphen.
sep = r"(?:[ ]*|-)?"
# Both entries are verbose-mode patterns: the layout newlines inside them are
# only ignored because each one starts with ``(?x)``.
person_patterns = [
    # A title or role prefix (Dr, Mme, Interne:, ...) followed by a name in one
    # of four layouts, exposed through named groups:
    #   LN0 FN0 - upper-case last name, then capitalised first name
    #   FN1 LN1 - capitalised first name, then upper-case last name
    #   LN3 FN2 - capitalised last name, then capitalised first name
    #   LN2     - a name alone
    # The match may not touch a "/" or "+" on either side.
    rf"""(?x)
(?<![/+])
\b
(?:[Dd]r[.]?|[Dd]octeur|[mM]r?[.]?|[Ii]nterne[ ]?:|[Ee]xterne[ ]?:|[Mm]onsieur|[Mm]adame|[Rr].f.rent[ ]?:|[P]r[.]?|[Pp]rofesseure?|[Mm]me[.]?|[Ee]nfant|[Mm]lle)[ ]+
(?:
(?P<LN0>[A-Z][A-Z](?:{sep}(?:ep[.]|de|[A-Z]+))*)[ ]+(?P<FN0>{Xxxxx}(?:{sep}{Xxxxx})*)
|(?P<FN1>{Xxxxx}(?:{sep}{Xxxxx})*)[ ]+(?P<LN1>[A-Z][A-Z]+(?:{sep}(?:ep[.]|de|[A-Z]+))*)
|(?P<LN3>{Xxxxx}(?:(?:-|[ ]de[ ]|[ ]ep[.][ ]){Xxxxx})*)[ ]+(?P<FN2>{Xxxxx}(?:-{Xxxxx})*)
|(?P<LN2>{XXxX_}+(?:{sep}{XXxX_}+)*)
)
\b(?![/+])
""",
    # An initial with a dot (FN0, e.g. "J.") followed by a name (LN0).  The
    # match may not touch a "/", "+" or "%" on either side.
    # The ``(?x)`` prefix is required: without it the newlines are literal and
    # the leading "newline, word boundary, newline" sequence can never match.
    rf"""(?x)
\b
(?<![/+%])
(?P<FN0>[A-Z][.])\s+(?P<LN0>{XXxX_}+(?:{sep}{XXxX_}+)*)
\b(?![/+%])
""",
]
# RPPS (doctor identifier)
# Verbose, case-insensitive regex source matching the 7 to 11 digits that
# immediately follow the keyword "RPPS", optional whitespace and an optional
# ":" or "=".  The keyword sits in a look-behind, so only the digits are part
# of the match.
# Matches cases like:
# - N° RPPS : 10009876543
# - or RPPS = 10009876543
# The look-behind is variable-width, which the standard-library ``re`` module
# rejects; presumably this is compiled with the third-party ``regex`` package
# -- confirm against the caller.
# NOTE(review): nothing constrains what follows the digits, so only the first
# 11 digits of a longer digit run are matched -- confirm this is intended.
rpps_pattern = r"""(?xi)
(?<=\b(?:RPPS\s*[:=]?\s*))
\d{7,11}
"""
# Entity label -> regex source string, in the order the labels are declared.
# ``person_patterns`` is not part of this mapping.
patterns = {
    "IPP": ipp_pattern,
    "MAIL": mail_pattern,
    "TEL": phone_pattern,
    "NDA": nda_pattern,
    "SECU": nss_pattern,
    "RPPS": rpps_pattern,
}