forked from argimenes/standoff-properties-editor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.cypher
35 lines (35 loc) · 1.63 KB
/
query.cypher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Export an XML (TEI) document stored as a graph into one standoff-property JSON map:
//   {text: <concatenated character data>, properties: [<one entry per tag below /TEI/text/body>]}
// Start at /TEI/text/body/*
// NOTE(review): the labels and relationship types used here (XmlDocument, XmlTag, XmlCharacters,
// NE, NEXT, IS_CHILD_OF, LAST_CHILD_OF) look like the graph written by apoc.xml.import with
// connected character nodes - confirm against the import step.

// Step 1: get the complete text first.
// Follow the NE chain from the document to the XmlCharacters node that has no outgoing NE
// relationship, i.e. the end of the chain.
MATCH path = (d:XmlDocument)-[:NE*]->(e:XmlCharacters)
WHERE NOT (e)-[:NE]->()
// tail() drops the first node of the path, which is the XmlDocument itself.
WITH tail(nodes(path)) AS words, d
WITH reduce(s = "", w IN words | s + w.text) AS allText, d

// Step 2: traverse the XML structure down to each tag node below /TEI/text/body.
MATCH (d)<-[:IS_CHILD_OF]-(:XmlTag {_name: 'TEI'})<-[:IS_CHILD_OF]-(:XmlTag {_name: 'text'})<-[:IS_CHILD_OF]-(base:XmlTag {_name: 'body'})
CALL apoc.path.expandConfig(base, {
  relationshipFilter: '<IS_CHILD_OF',
  labelFilter: 'XmlTag',
  bfs: false,
  minLevel: 1
}) YIELD path
// The last node of each expanded path is the tag to describe.
WITH allText, nodes(path)[-1] AS this

// Step 3: follow the tag for the longest loop with LAST_CHILD_OF -> contains the text stream
// underneath this tag. Candidate paths run along NEXT from the tag to a node from which the
// tag is reachable again via LAST_CHILD_OF, and must contain at least one XmlCharacters node.
MATCH p = (this)-[:NEXT*]->(x)
WHERE (x)-[:LAST_CHILD_OF*]->(this)
  AND any(n IN nodes(p) WHERE n:XmlCharacters)
// Cypher gives no row-order guarantee without ORDER BY, so sort by path length explicitly;
// the last collected path per tag is then really the longest one.
WITH allText, this, p
ORDER BY length(p)
WITH allText, this, collect(p)[-1] AS longest
WITH allText, this, [n IN nodes(longest) WHERE n:XmlCharacters] AS xmlCharacters

// Step 4: derive the covered index range and the covered text from the character nodes.
WITH allText, this,
     apoc.coll.min([c IN xmlCharacters | c.startIndex]) AS firstIndex,
     apoc.coll.max([c IN xmlCharacters | c.endIndex]) AS lastIndex,
     apoc.text.join([c IN xmlCharacters | c.text], "") AS text

// Step 5: build one standoff property per tag and aggregate everything into a single result.
WITH allText, {
  index: id(this),
  startIndex: firstIndex,
  endIndex: lastIndex,
  text: text,
  type: this._name,
  // Every property of the tag whose key does not start with "_" is exported as an attribute.
  // A fixed "__dummy" entry is added to prevent empty maps (which cause issues in cypher-shell).
  attributes: apoc.map.setKey(
    apoc.map.fromPairs([k IN keys(this) WHERE NOT k STARTS WITH "_" | [k, this[k]]]),
    "__dummy", 1)
} AS standoffProperty
RETURN {text: allText, properties: collect(standoffProperty)} AS json;