Skip to content

Commit

Permalink
Added Tree datastructure for dependency tree
Browse files Browse the repository at this point in the history
Updated readme
  • Loading branch information
ArthurDevNL committed Jan 17, 2021
1 parent 355d6b9 commit d1efdcd
Show file tree
Hide file tree
Showing 10 changed files with 288 additions and 51 deletions.
6 changes: 2 additions & 4 deletions Conllu/Conllu/ConlluParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Conllu.Extensions;

namespace Conllu
{
Expand All @@ -21,10 +22,7 @@ public static IEnumerable<Sentence> ParseFile(string filePath)
/// <param name="text">The text to parse (should be in CoNLL-U format)</param>
/// <returns>An enumerable of sentences parsed from the text</returns>
public static IEnumerable<Sentence> ParseText(string text)
    // Line splitting (\r\n, \r and \n terminators) is delegated to the SplitLines
    // string extension so the separator list lives in exactly one place.
    => Parse(text.SplitLines());

/// <summary>
/// Parses an enumerable of lines to an enumerable of sentences
Expand Down
9 changes: 9 additions & 0 deletions Conllu/Conllu/Extensions/StringExtensions.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
using System;
using System.Collections.Generic;

namespace Conllu.Extensions
{
internal static class StringExtensions
Expand All @@ -7,5 +10,11 @@ public static string ValueOrNull(this string x)

/// <summary>
/// Returns <paramref name="x"/> unchanged when its trimmed value is not reported as empty by
/// <c>IsNullOrEmpty</c>; returns the placeholder "_" otherwise, including when it is null.
/// </summary>
public static string ValueOrUnderscore(this string x)
{
    if (x == null)
        return "_";

    return x.Trim().IsNullOrEmpty() ? "_" : x;
}

/// <summary>
/// Splits the text into lines on "\r\n", "\r" or "\n". Empty lines are kept.
/// </summary>
/// <param name="x">The text to split</param>
/// <returns>The lines of the text, without their line terminators</returns>
public static IEnumerable<string> SplitLines(this string x)
{
    string[] lineTerminators = {"\r\n", "\r", "\n"};
    return x.Split(lineTerminators, StringSplitOptions.None);
}
}
}
70 changes: 68 additions & 2 deletions Conllu/Conllu/Sentence.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Conllu.Enums;

namespace Conllu
Expand Down Expand Up @@ -29,10 +30,75 @@ public Sentence(List<Token> tokens, Dictionary<string, string> metadata = null)
Metadata = metadata ?? new Dictionary<string, string>();
}

/// <summary>
/// Constructs a dependency tree from the tokens in the sentence. Only tokens that have a valid head
/// and dependency relation are attached; multiword tokens and empty nodes are left out.
/// </summary>
/// <returns>
/// The node of the root token (head 0 or the root relation; the last such token wins),
/// or null when the sentence contains no root token
/// </returns>
public Tree<Token, DependencyRelation> AsDependencyTree()
{
    // One node per token ID. Multiword-token ranges and empty nodes are skipped so that a head
    // reference always resolves to a regular token. TryAdd keeps the first token if an ID repeats.
    var nodes = new Dictionary<int, Tree<Token, DependencyRelation>>();
    foreach (var token in Tokens)
    {
        if (token.IsMultiwordToken || token.IsEmptyNode)
            continue;

        nodes.TryAdd(token.Id, new Tree<Token, DependencyRelation>(token));
    }

    Tree<Token, DependencyRelation> root = null;
    foreach (var node in nodes.Values)
    {
        var token = node.Value;
        if (token.Head == 0 || token.DepRelEnum == DependencyRelation.Root)
        {
            root = node;
            continue;
        }

        // Tokens without a head or without a known relation cannot be attached
        if (token.Head == null || token.DepRelEnum == null)
            continue;

        // A head that points outside the sentence is ignored as well
        if (nodes.TryGetValue(token.Head.Value, out var parent))
            parent.AddChild(node, token.DepRelEnum.Value);
    }

    return root;
}

/// <summary>
/// Returns the list of tokens that make up the raw sentence. IDs are walked from 1 up to the highest ID
/// and, for every ID, the first token carrying that ID is taken. When that token is a multiword token,
/// the IDs covered by its span are skipped so the spanned tokens are not repeated.
/// </summary>
/// <returns>The raw tokens in ID order; an empty list when the sentence has no tokens</returns>
public List<Token> RawTokens()
{
    var rawTokens = new List<Token>();

    // Remember the first token seen for every ID
    var firstById = new Dictionary<int, Token>();
    foreach (var token in Tokens)
        firstById.TryAdd(token.Id, token);

    if (firstById.Count == 0)
        return rawTokens;

    // Computed once instead of on every loop iteration
    var lastId = firstById.Keys.Max();
    var i = 1;
    while (i <= lastId)
    {
        var next = i + 1;

        // IDs that are missing from the sentence are skipped
        if (firstById.TryGetValue(i, out var current))
        {
            rawTokens.Add(current);

            // Continue AFTER the end of the span; continuing at the span end itself would add the
            // last spanned token a second time. A span end before the current ID is ignored so the
            // loop always advances.
            if (current.IsMultiwordToken && current.Identifier.SpanId is int spanEnd && spanEnd >= next)
                next = spanEnd + 1;
        }

        i = next;
    }

    return rawTokens;
}

/// <summary>
/// Returns the flat text of the sentence. It returns the "text" entry of the metadata if available,
/// otherwise a concatenation of the forms of the tokens returned by <see cref="RawTokens"/>.
/// </summary>
/// <returns>The sentence text; tokens are separated by a single space unless MISC contains SpaceAfter=No</returns>
public string RawTokenSequence()
{
    // Single lookup instead of ContainsKey followed by the indexer
    if (Metadata.TryGetValue("text", out var text))
        return text;

    var tokens = RawTokens();
    var builder = new StringBuilder();
    for (var i = 0; i < tokens.Count; i++)
    {
        var token = tokens[i];
        builder.Append(token.Form);

        // Ordinal, case-insensitive match: the result must not depend on the current culture
        var noSpaceAfter = token.Misc != null
            && token.Misc.IndexOf("SpaceAfter=No", System.StringComparison.OrdinalIgnoreCase) >= 0;
        var isLast = i == tokens.Count - 1;

        // Don't append a space if MISC contains SpaceAfter=No or if it is the last token
        if (!noSpaceAfter && !isLast)
            builder.Append(' ');
    }

    return builder.ToString();
}

public string Serialize()
Expand Down
9 changes: 7 additions & 2 deletions Conllu/Conllu/Token.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace Conllu
{
public class Token
public class Token: IComparable<Token>
{
/// <summary>
/// The main ID of the token. Quick accessor to <see cref="Identifier"/>
Expand Down Expand Up @@ -87,7 +87,12 @@ public class Token
/// Whether the token is an empty node. Utility method for <see cref="Identifier"/>
/// </summary>
public bool IsEmptyNode => Identifier.IsEmptyNode;


/// <summary>
/// Orders tokens by their <see cref="Identifier"/>.
/// </summary>
/// <param name="other">The token to compare with; may be null</param>
/// <returns>
/// A negative value when this token sorts before <paramref name="other"/>, zero when both sort equal,
/// a positive value when it sorts after. Any token sorts after null.
/// </returns>
public int CompareTo(Token other)
{
    // IComparable contract: every instance compares greater than null
    if (other is null)
        return 1;

    return Identifier.CompareTo(other.Identifier);
}

/// <summary>
/// Returns the <c>Form</c> of the token.
/// </summary>
public override string ToString()
{
    return Form;
}

Expand Down
45 changes: 43 additions & 2 deletions Conllu/Conllu/TokenIdentifier.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using System.Collections;
using System;

namespace Conllu
{
public class TokenIdentifier
public class TokenIdentifier: IComparable<TokenIdentifier>
{
/// <summary>
/// The ID of the token
Expand Down Expand Up @@ -56,5 +56,46 @@ public string Serialize()

return $"{Id}";
}

/// <summary>
/// Identifiers are sorted by their ID. Between identifiers with the same ID, the order is:
/// multiword indices (by span ID), then the plain ID, and then empty nodes (by sub ID).
/// </summary>
/// <param name="other">The identifier to compare with; may be null</param>
/// <returns>
/// A negative value when this identifier sorts before <paramref name="other"/>, zero when both sort equal,
/// a positive value when it sorts after. Any identifier sorts after null.
/// </returns>
public int CompareTo(TokenIdentifier other)
{
    // IComparable contract: every instance compares greater than null
    if (other is null)
        return 1;

    var byId = Id.CompareTo(other.Id);
    if (byId != 0)
        return byId;

    // Rank BOTH sides, so that a.CompareTo(b) and b.CompareTo(a) always agree.
    // Looking only at the kind of this identifier made "plain vs multiword" return 0
    // while "multiword vs plain" returned -1.
    var byKind = KindOrder(this).CompareTo(KindOrder(other));
    if (byKind != 0)
        return byKind;

    // Same ID and same kind: fall back to the span end / sub ID
    if (IsMultiwordIndex)
        return (SpanId ?? Id).CompareTo(other.SpanId ?? other.Id);

    if (IsEmptyNode)
        return (SubId ?? 0).CompareTo(other.SubId ?? 0);

    return 0;
}

/// <summary>
/// Position of the identifier kind among identifiers sharing one ID: multiword (0), plain (1), empty node (2).
/// </summary>
private static int KindOrder(TokenIdentifier identifier)
{
    if (identifier.IsMultiwordIndex)
        return 0;

    return identifier.IsEmptyNode ? 2 : 1;
}

/// <summary>
/// Tells whether the given ID is covered by this identifier: equal to the ID, or inside the
/// span (both ends inclusive) in case of a multiword index.
/// </summary>
/// <param name="id">The token ID to test</param>
/// <returns>True when <paramref name="id"/> equals the ID or lies within the span of a multiword index</returns>
public bool IsInRange(int id)
{
    if (IsMultiwordIndex)
    {
        // ReSharper disable once PossibleInvalidOperationException
        return Id <= id && id <= SpanId.Value;
    }

    return Id == id;
}
}
}
69 changes: 51 additions & 18 deletions Conllu/Conllu/Tree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

namespace Conllu
{
public class Tree<TVertex, TConnection>
public class Tree<TVertex, TConnection>: IComparable<Tree<TVertex, TConnection>>
where TVertex: IComparable<TVertex>
{
public SortedDictionary<Tree<TVertex, TConnection>, TConnection> Connections { get; set; }
public SortedDictionary<Tree<TVertex, TConnection>, TConnection> Connections { get; }

public IEnumerable<Tree<TVertex, TConnection>> Children => Connections.Keys;

Expand All @@ -18,44 +19,76 @@ public class Tree<TVertex, TConnection>
/// <summary>
/// Creates a tree node holding <paramref name="value"/>.
/// </summary>
/// <param name="value">The value stored in this node</param>
/// <param name="connections">The children of the node with their connection; an empty collection is created when null</param>
public Tree(TVertex value, SortedDictionary<Tree<TVertex, TConnection>, TConnection> connections = null)
{
    Value = value;

    // Connections must never be null: AddChild, Children and ToString all dereference it
    Connections = connections ?? new SortedDictionary<Tree<TVertex, TConnection>, TConnection>();
}

/// <summary>
/// Adds <paramref name="child"/> to the connections of this node.
/// </summary>
/// <param name="child">The subtree to attach</param>
/// <param name="connection">The connection value stored alongside the child</param>
public void AddChild(Tree<TVertex, TConnection> child, TConnection connection)
    => Connections.Add(child, connection);

/// <summary>
/// Traverses the tree depth-first to find elements matching the predicate.
/// </summary>
/// <param name="predicate">The predicate to fulfill</param>
/// <returns>Any elements that match the predicate in a depth-first search manner</returns>
public IEnumerable<Tree<TVertex, TConnection>> WhereDfs(Func<TVertex, bool> predicate)
{
    var pending = new Stack<Tree<TVertex, TConnection>>();
    pending.Push(this);

    while (pending.TryPop(out var node))
    {
        // Each node is tested exactly once, when it is taken off the stack
        if (predicate(node.Value))
            yield return node;

        // Push the children in reverse so the first child is popped (and visited) first;
        // pushing them in order would walk every level back to front.
        foreach (var child in node.Children.Reverse())
            pending.Push(child);
    }
}

/// <summary>
/// Traverses the tree breadth-first to find elements matching the predicate.
/// </summary>
/// <param name="predicate">The predicate to fulfill</param>
/// <returns>Any elements that match the predicate in a breadth-first search manner</returns>
public IEnumerable<Tree<TVertex, TConnection>> WhereBfs(Func<TVertex, bool> predicate)
{
    var pending = new Queue<Tree<TVertex, TConnection>>();
    pending.Enqueue(this);

    while (pending.TryDequeue(out var node))
    {
        // The root is tested here like every other node; a separate check before
        // the loop would report a matching root twice.
        if (predicate(node.Value))
            yield return node;

        foreach (var child in node.Children)
            pending.Enqueue(child);
    }
}

/// <summary>
/// Compares the values of the two nodes to one another.
/// </summary>
/// <param name="other">The node to compare with; may be null</param>
/// <returns>The result of comparing the node values; a positive value when <paramref name="other"/> is null</returns>
public int CompareTo(Tree<TVertex, TConnection> other)
{
    // IComparable contract: every instance compares greater than null
    if (other is null)
        return 1;

    return Value.CompareTo(other.Value);
}

/// <summary>
/// Renders the tree as text: the value of this node first, followed by one line per descendant.
/// Every descendant line is indented with one tab per level and ends with its connection in parentheses.
/// </summary>
public override string ToString()
{
    var text = Value.ToString() ?? string.Empty;
    return Connections.Aggregate(text, (current, kvp) => current + kvp.Key.ToString(1, kvp.Value));
}

/// <summary>
/// Renders this node and everything below it as lines of text.
/// </summary>
/// <param name="depth">The number of tabs to indent this node with; the parameterless overload starts at 1</param>
/// <param name="connection">The connection between this node and its parent</param>
/// <returns>The line for this node, preceded by a newline, followed by the lines of all descendants</returns>
private string ToString(int depth, TConnection connection)
{
    var line = $"\n{new string('\t', depth)}{Value.ToString()} ({connection})";

    // With no connections Aggregate returns the seed unchanged, so leaves need no special case
    return Connections.Aggregate(line, (current, kvp) => current + kvp.Key.ToString(depth + 1, kvp.Value));
}
}
}
46 changes: 45 additions & 1 deletion Conllu/ConlluTests/ConlluTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Reflection;
using Conllu;
using Conllu.Enums;
using Conllu.Extensions;
using NUnit.Framework;

namespace ConlluTests
Expand Down Expand Up @@ -78,6 +79,7 @@ public void TestParseLargeFile()
var result = Conllu.ConlluParser.ParseText(text).ToList();
Assert.AreEqual(2002, result.Count);
Assert.IsTrue(result.All(x => !x.IsEmpty()));
Assert.IsTrue(result.All(s => s.AsDependencyTree() != null));
}

[Test]
Expand All @@ -100,14 +102,16 @@ public void TestSerializeParse()
{ "text", "The quick brown fox jumps over the lazy dog."}
});
var serialized = ConlluParser.Serialize(new List<Sentence> {sentence});
var serializedLines = serialized.SplitLines();

// Compare to the file
var assembly = typeof(Tests).GetTypeInfo().Assembly;
var stream = assembly.GetManifestResourceStream("ConlluTests.Resources.TestSentence.conllu");
// ReSharper disable once AssignNullToNotNullAttribute
using var reader = new StreamReader(stream);
var text = reader.ReadToEnd();
Assert.AreEqual(text, serialized);
var textLines = text.SplitLines();
Assert.AreEqual(textLines, serializedLines);

// Re-parse
var parsed = ConlluParser.ParseText(text).ToList();
Expand All @@ -119,5 +123,45 @@ public void TestSerializeParse()
Assert.IsTrue(s.Metadata.ContainsKey("text"));
Assert.AreEqual("The quick brown fox jumps over the lazy dog.", s.Metadata["text"]);
}

/// <summary>
/// Parses the embedded test sentence and checks the root and the child counts of its dependency tree.
/// </summary>
[Test]
public void TestCreateParseTree()
{
    // Read the sample sentence from the embedded resources of the test assembly
    var testAssembly = typeof(Tests).GetTypeInfo().Assembly;
    var resource = testAssembly.GetManifestResourceStream("ConlluTests.Resources.TestSentence.conllu");
    // ReSharper disable once AssignNullToNotNullAttribute
    using var reader = new StreamReader(resource);
    var conllu = reader.ReadToEnd();

    var sentence = ConlluParser.ParseText(conllu).FirstOrDefault();
    Assert.NotNull(sentence);

    // The root of the tree is token 5 ("jumps")
    var tree = sentence.AsDependencyTree();
    Assert.AreEqual(5, tree.Value.Id);
    Assert.AreEqual("jumps", tree.Value.Form);

    // Three children below the root; the first two have three children each, the last one has none
    Assert.AreEqual(3, tree.Children.Count());
    var children = tree.Children.ToList();
    Assert.AreEqual(3, children[0].Children.Count());
    Assert.AreEqual(3, children[1].Children.Count());
    Assert.AreEqual(0, children[2].Children.Count());
}

/// <summary>
/// Checks that the text of a sentence without metadata is rebuilt from the token forms.
/// </summary>
[Test]
public void TestRawTokenSequence()
{
// No metadata is passed, so the sentence has no "text" entry to return
var sentence = new Sentence(new List<Token>
{
Token.FromLine("1 The the DET DT Definite=Def|PronType=Art 4 det _ _"),
Token.FromLine("2 quick quick ADJ JJ Degree=Pos 4 amod _ _"),
Token.FromLine("3 brown brown ADJ JJ Degree=Pos 4 amod _ _"),
Token.FromLine("4 fox fox NOUN NN Number=Sing 5 nsubj _ _"),
Token.FromLine("5 jumps jump VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _"),
Token.FromLine("6 over over ADP IN _ 9 case _ _"),
Token.FromLine("7 the the DET DT Definite=Def|PronType=Art 9 det _ _"),
Token.FromLine("8 lazy lazy ADJ JJ Degree=Pos 9 amod _ _"),
// SpaceAfter=No on "dog": the expected text below has no space before the final "."
Token.FromLine("9 dog dog NOUN NN Number=Sing 5 nmod _ SpaceAfter=No"),
Token.FromLine("10 . . PUNCT . _ 5 punct _ _")
});

Assert.AreEqual("The quick brown fox jumps over the lazy dog.", sentence.RawTokenSequence());
}
}
}
Loading

0 comments on commit d1efdcd

Please sign in to comment.