diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index 9e9d06e0c8..213ac6f114 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -55,6 +55,7 @@ import java.io.IOException; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -349,6 +350,70 @@ public static List getPostingsListWithAnalyzer(IndexReader reader, Stri return getPostingsList(reader, term, analyzer); } + /** + * Returns the document vector for a particular document as a list of tokens contained in the document. Note that this + * method explicitly returns {@code null} if the document does not exist (as opposed to an empty list), so that the + * caller is explicitly forced to handle this case. + * + * @param reader index reader + * @param docid collection docid + * @return the document vector for a particular document as a list of tokens or {@code null} if document does not exist. + * @throws IOException if error encountered during query + * @throws NotStoredException if the term vector is not stored or positions are not stored + */ + public static List getDocumentTokens(IndexReader reader, String docid) throws IOException, NotStoredException { + int ldocid = convertDocidToLuceneDocid(reader, docid); + if (ldocid == -1) { + return null; + } + Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS); + if (terms == null) { + throw new NotStoredException("Document vector not stored!"); + } + if (!terms.hasPositions()) { + throw new NotStoredException("Document vector not stored!"); + } + TermsEnum te = terms.iterator(); + if (te == null) { + throw new NotStoredException("Document vector not stored!"); + } + + // We need to first find out how long the document vector is so we can allocate an array for it. + // The temptation is to just call terms.getSumTotalTermFreq(), but we can't - since this value will not include stopwords! + // The only sure way is to iterate through all the terms once to find the max position. + // Note that position is zero-based. + PostingsEnum postingsEnum = null; + int maxPos = 0; + while ((te.next()) != null) { + postingsEnum = te.postings(postingsEnum); + postingsEnum.nextDoc(); + + for (int j=0; j maxPos) { + maxPos = pos; + } + } + } + + // We now know how long to make the array. + String[] tokens = new String[maxPos + 1]; + + // Go through the terms again, this time to actually build the list of tokens. + te = reader.getTermVector(ldocid, IndexArgs.CONTENTS).iterator(); + while ((te.next()) != null) { + postingsEnum = te.postings(postingsEnum); + postingsEnum.nextDoc(); + + for (int j=0; j referenceRunOutput = new HashMap<>(); - protected Map> documents = new HashMap<>(); - protected Map>> tokens = new HashMap<>(); + protected Map> referenceDocs = new HashMap<>(); + protected Map>> referenceDocTokens = new HashMap<>(); protected Map> queryTokens = new HashMap<>(); // These are the sources of truth @@ -198,20 +198,14 @@ public void checkIndex() throws IOException { for (int i=0; i actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid); - Iterator it = actualToken.entrySet().iterator(); - while (it.hasNext()) { - Map.Entry pair = (Map.Entry)it.next(); - assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue()); - it.remove(); - } + List docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid); + assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens); } catch (NotStoredException e) { e.printStackTrace(); } diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java index 0cfe1ba967..86388417f0 100644 --- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java +++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java @@ -45,14 +45,14 @@ protected IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 3; - documents.put("TREC_DOC_1", Map.of( + referenceDocs.put("TREC_DOC_1", Map.of( "contents", "This is head very simple text", "raw", "This is head\n" + "\n" + "very simple\n" + "text\n" + "")); - documents.put("WSJ_1", Map.of( + referenceDocs.put("WSJ_1", Map.of( "contents", "head text 01/30/03 content", "raw", "\n" + "head text\n" + @@ -65,7 +65,7 @@ protected void setCheckIndexGroundTruth() { "\n" + "\n" + "")); - documents.put("DOC222", Map.of( + referenceDocs.put("DOC222", Map.of( "contents", "HEAD simple enough text text text", "raw", "HEAD\n" + "\n" + diff --git a/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java index c36ca0c706..cfb6034e59 100644 --- a/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java +++ b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java @@ -15,15 +15,11 @@ */ package io.anserini.integration; -import io.anserini.collection.DocumentCollection; import io.anserini.collection.JsonCollection; -import io.anserini.collection.TrecCollection; import io.anserini.index.IndexArgs; -import io.anserini.index.IndexCollection; import io.anserini.index.generator.DefaultLuceneDocumentGenerator; import io.anserini.search.SearchArgs; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -45,34 +41,30 @@ IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 2; - documents.put("2000000", Map.of( + referenceDocs.put("2000000", Map.of( "contents", "this was ##a simple pretokenized test", "raw","{\n" + " \"id\" : \"2000000\",\n" + " \"contents\" : \"this was ##a simple pretokenized test\"\n" + "}")); - documents.put("2000001", Map.of( + referenceDocs.put("2000001", Map.of( "contents", "some time extra ##vert ##ing and some time intro ##vert ##ing", "raw","{\n" + " \"id\" : \"2000001\",\n" + " \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" + "}" )); - tokens.put("2000000", Map.of( - "contents", Map.of( - "this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L))); - tokens.put("2000001",Map.of( - "contents", Map.of( - "some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L))); + referenceDocTokens.put("2000000", Map.of( + "contents", List.of("this", "was", "##a", "simple", "pretokenized", "test"))); + referenceDocTokens.put("2000001", Map.of( + "contents", List.of("some", "time", "extra", "##vert", "##ing", "and", "some", "time", "intro", "##vert", "##ing"))); fieldNormStatusTotalFields = 1; - // whitespace analyzer keeps everything, includes docid - // this is ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001 termIndexStatusTermCount = 15; termIndexStatusTotFreq = 15; storedFieldStatusTotalDocCounts = 2; termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts; - storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2 + storedFieldStatusTotFields = 6; } @Override diff --git a/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java b/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java index 50b6a1fc49..c50c76d4bc 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java @@ -3,6 +3,8 @@ import io.anserini.collection.TrecCollection; import io.anserini.index.IndexArgs; +import java.util.Arrays; +import java.util.List; import java.util.Map; public class TrecEndToEndExternalStopwordsTest extends EndToEndTest { @@ -20,14 +22,14 @@ protected IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 3; - documents.put("TREC_DOC_1", Map.of( + referenceDocs.put("TREC_DOC_1", Map.of( "contents", "This is head very simple text", "raw", "This is head\n" + "\n" + "very simple\n" + "text\n" + "")); - documents.put("WSJ_1", Map.of( + referenceDocs.put("WSJ_1", Map.of( "contents", "head text 01/30/03 content", "raw", "\n" + "head text\n" + @@ -40,7 +42,7 @@ protected void setCheckIndexGroundTruth() { "\n" + "\n" + "")); - documents.put("DOC222", Map.of( + referenceDocs.put("DOC222", Map.of( "contents", "HEAD simple enough text text text", "raw", "HEAD\n" + "\n" + @@ -51,6 +53,13 @@ protected void setCheckIndexGroundTruth() { "text\n" + "")); + referenceDocTokens.put("TREC_DOC_1", Map.of( + "contents", Arrays.asList(new String[]{"thi", "is", "head", "veri", null, "text"}))); + referenceDocTokens.put("WSJ_1", Map.of( + "contents", List.of("head", "text", "01", "30", "03", "content"))); + referenceDocTokens.put("DOC222", Map.of( + "contents", Arrays.asList(new String[]{"head", null, null, "text", "text", "text"}))); + // Terms per document: // d1: TREC_DOC_1 this is head very simple text // d2: DOC222 head simple enough text diff --git a/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java b/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java index c25fdbe183..dfa9fc268d 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java @@ -20,6 +20,8 @@ import io.anserini.index.IndexArgs; import io.anserini.search.SearchArgs; +import java.util.Arrays; +import java.util.List; import java.util.Map; public class TrecEndToEndPassageTest extends EndToEndTest { @@ -36,14 +38,14 @@ protected IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 3; - documents.put("TREC_DOC_1.00001", Map.of( + referenceDocs.put("TREC_DOC_1.00001", Map.of( "contents", "This is head very simple text", "raw", "This is head\n" + "\n" + "very simple\n" + "text\n" + "")); - documents.put("WSJ_1", Map.of( + referenceDocs.put("WSJ_1", Map.of( "contents", "head text 01/30/03 content", "raw", "\n" + "head text\n" + @@ -56,7 +58,7 @@ protected void setCheckIndexGroundTruth() { "\n" + "\n" + "")); - documents.put("TREC_DOC_1.00002", Map.of( + referenceDocs.put("TREC_DOC_1.00002", Map.of( "contents", "HEAD simple enough text text text", "raw", "HEAD\n" + "\n" + @@ -67,6 +69,13 @@ protected void setCheckIndexGroundTruth() { "text\n" + "")); + referenceDocTokens.put("TREC_DOC_1.00001", Map.of( + "contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"}))); + referenceDocTokens.put("WSJ_1", Map.of( + "contents", List.of("head", "text", "01", "30", "03", "content"))); + referenceDocTokens.put("TREC_DOC_1.00002", Map.of( + "contents", List.of("head", "simpl", "enough", "text", "text", "text"))); + fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids. termIndexStatusTotFreq = 17; diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java index f876e0cf79..efce8d9c15 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java @@ -19,6 +19,8 @@ import io.anserini.collection.TrecCollection; import io.anserini.index.IndexArgs; +import java.util.Arrays; +import java.util.List; import java.util.Map; public class TrecEndToEndTest extends EndToEndTest { @@ -35,14 +37,14 @@ protected IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 3; - documents.put("TREC_DOC_1", Map.of( + referenceDocs.put("TREC_DOC_1", Map.of( "contents", "This is head very simple text", "raw", "This is head\n" + "\n" + "very simple\n" + "text\n" + "")); - documents.put("WSJ_1", Map.of( + referenceDocs.put("WSJ_1", Map.of( "contents", "head text 01/30/03 content", "raw", "\n" + "head text\n" + @@ -55,7 +57,7 @@ protected void setCheckIndexGroundTruth() { "\n" + "\n" + "")); - documents.put("DOC222", Map.of( + referenceDocs.put("DOC222", Map.of( "contents", "HEAD simple enough text text text", "raw", "HEAD\n" + "\n" + @@ -66,6 +68,13 @@ protected void setCheckIndexGroundTruth() { "text\n" + "")); + referenceDocTokens.put("TREC_DOC_1", Map.of( + "contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"}))); + referenceDocTokens.put("WSJ_1", Map.of( + "contents", List.of("head", "text", "01", "30", "03", "content"))); + referenceDocTokens.put("DOC222", Map.of( + "contents", List.of("head", "simpl", "enough", "text", "text", "text"))); + fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids. termIndexStatusTotFreq = 17; diff --git a/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java b/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java index c57089b7ad..4aa8a76281 100644 --- a/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java +++ b/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java @@ -19,6 +19,8 @@ import io.anserini.collection.TrecCollection; import io.anserini.index.IndexArgs; +import java.util.Arrays; +import java.util.List; import java.util.Map; public class TrecEndToEndWhitelistTest extends EndToEndTest { @@ -37,7 +39,7 @@ protected IndexArgs getIndexArgs() { @Override protected void setCheckIndexGroundTruth() { docCount = 1; - documents.put("DOC222", Map.of( + referenceDocs.put("DOC222", Map.of( "contents", "HEAD simple enough text text text", "raw", "HEAD\n" + "\n" + @@ -48,6 +50,9 @@ protected void setCheckIndexGroundTruth() { "text\n" + "")); + referenceDocTokens.put("DOC222", Map.of( + "contents", List.of("head", "simpl", "enough", "text", "text", "text"))); + fieldNormStatusTotalFields = 1; // text termIndexStatusTermCount = 5; // Note that standard analyzer ignores stopwords; includes docids. termIndexStatusTotFreq = 5; diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java index 2907d5a2f3..f8d14077ea 100644 --- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java +++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java @@ -48,16 +48,16 @@ protected void setCheckIndexGroundTruth() { // Note that based on our settings, retweets and tweets with id > 9 will not be indexed. docCount = 4; - documents.put("3", Map.of( + referenceDocs.put("3", Map.of( "contents", "This tweet will be indexed thanks", "raw", "{\"created_at\":\"Thu Aug 11 22:57:52 +0000 2016\",\"id\":3,\"id_str\":\"3\",\"text\":\"This tweet will be indexed thanks.\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":3358015773,\"id_str\":\"3358015773\",\"name\":\"Cami\",\"screen_name\":\"B\",\"location\":\"Ciudad Aut\\u00f3noma de Buenos Aire\",\"url\":null,\"description\":\"15.Geminiana\\u264a Ig: CamiiMariana15 Snap: camilaracabutto\",\"protected\":false,\"verified\":false,\"followers_count\":392,\"friends_count\":307,\"listed_count\":0,\"favourites_count\":11254,\"statuses_count\":21876,\"created_at\":\"Sat Jul 04 04:32:40 +0000 2015\",\"utc_offset\":-25200,\"time_zone\":\"Pacific Time (US & Canada)\",\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_image_url_https\":\"https:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_tile\":false,\"profile_link_color\":\"9266CC\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/3358015773\\/1470945786\",\"default_profile\":false,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[],\"urls\":[],\"user_mentions\":[{\"screen_name\":\"Jul1et4wizz\",\"name\":\"Julieta\",\"id\":1599099673,\"id_str\":\"1599099673\",\"indices\":[3,15]}],\"symbols\":[]},\"favorited\":false,\"retweeted\":false,\"filter_level\":\"low\",\"lang\":\"en\",\"timestamp_ms\":\"1470956272659\"}")); - documents.put("5", Map.of( + referenceDocs.put("5", Map.of( "contents", "Can you think of more interesting contents", "raw", "{\"created_at\":\"Thu Aug 11 23:57:52 +0000 2016\",\"id\":5,\"id_str\":\"5\",\"text\":\"Can you think of more interesting contents?\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":3358015773,\"id_str\":\"3358015773\",\"name\":\"Cami\",\"screen_name\":\"C\",\"location\":\"Ciudad Aut\\u00f3noma de Buenos Aire\",\"url\":null,\"description\":\"15.Geminiana\\u264a Ig: CamiiMariana15 Snap: camilaracabutto\",\"protected\":false,\"verified\":false,\"followers_count\":392,\"friends_count\":307,\"listed_count\":0,\"favourites_count\":11254,\"statuses_count\":21876,\"created_at\":\"Sat Jul 04 04:32:40 +0000 2015\",\"utc_offset\":-25200,\"time_zone\":\"Pacific Time (US & Canada)\",\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_image_url_https\":\"https:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_tile\":false,\"profile_link_color\":\"9266CC\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/3358015773\\/1470945786\",\"default_profile\":false,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[],\"urls\":[],\"user_mentions\":[{\"screen_name\":\"Jul1et4wizz\",\"name\":\"Julieta\",\"id\":1599099673,\"id_str\":\"1599099673\",\"indices\":[3,15]}],\"symbols\":[]},\"favorited\":false,\"retweeted\":false,\"filter_level\":\"low\",\"lang\":\"cn\",\"timestamp_ms\":\"1470956272659\"}")); - documents.put("6", Map.of( + referenceDocs.put("6", Map.of( "contents", "We have some real contents here thanks", "raw", "{\"created_at\":\"Thu Aug 11 21:57:50 +0000 2016\",\"id\":6,\"id_str\":\"6\",\"text\":\"We have some real contents here thanks https:\\/\\/t.co\\/1a2b3c\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":763875115104960516,\"id_str\":\"763875115104960516\",\"name\":\"Esequiel Manson\",\"screen_name\":\"X\",\"location\":\"San Miguel, Argentina\",\"url\":null,\"description\":null,\"protected\":false,\"verified\":false,\"followers_count\":0,\"friends_count\":2,\"listed_count\":0,\"favourites_count\":2,\"statuses_count\":2,\"created_at\":\"Thu Aug 11 23:09:54 +0000 2016\",\"utc_offset\":null,\"time_zone\":null,\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"F5F8FA\",\"profile_background_image_url\":\"\",\"profile_background_image_url_https\":\"\",\"profile_background_tile\":false,\"profile_link_color\":\"2B7BB9\",\"profile_sidebar_border_color\":\"C0DEED\",\"profile_sidebar_fill_color\":\"DDEEF6\",\"profile_text_color\":\"333333\",\"profile_use_background_image\":true,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/763875115104960516\\/1470957611\",\"default_profile\":true,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[{\"text\":\"perfil\",\"indices\":[0,7]},{\"text\":\"tattoo\",\"indices\":[8,15]},{\"text\":\"feo\",\"indices\":[16,20]},{\"text\":\"paisaje\",\"indices\":[21,29]}],\"urls\":[],\"user_mentions\":[],\"symbols\":[],\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"extended_entities\":{\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"favorited\":false,\"retweeted\":false,\"possibly_sensitive\":false,\"filter_level\":\"low\",\"lang\":\"und\",\"timestamp_ms\":\"1470959870658\"}")); - documents.put("8", Map.of( + referenceDocs.put("8", Map.of( "contents", "test adding more tweet", "raw", "{\"created_at\":\"Thu Aug 11 22:57:50 +0000 2016\",\"id\":8,\"id_str\":\"8\",\"text\":\"test adding more tweets\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":763875115104960516,\"id_str\":\"763875115104960516\",\"name\":\"Esequiel Manson\",\"screen_name\":\"Y\",\"location\":\"San Miguel, Argentina\",\"url\":null,\"description\":null,\"protected\":false,\"verified\":false,\"followers_count\":0,\"friends_count\":2,\"listed_count\":0,\"favourites_count\":2,\"statuses_count\":2,\"created_at\":\"Thu Aug 11 23:09:54 +0000 2016\",\"utc_offset\":null,\"time_zone\":null,\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"F5F8FA\",\"profile_background_image_url\":\"\",\"profile_background_image_url_https\":\"\",\"profile_background_tile\":false,\"profile_link_color\":\"2B7BB9\",\"profile_sidebar_border_color\":\"C0DEED\",\"profile_sidebar_fill_color\":\"DDEEF6\",\"profile_text_color\":\"333333\",\"profile_use_background_image\":true,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/763875115104960516\\/1470957611\",\"default_profile\":true,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[{\"text\":\"perfil\",\"indices\":[0,7]},{\"text\":\"tattoo\",\"indices\":[8,15]},{\"text\":\"feo\",\"indices\":[16,20]},{\"text\":\"paisaje\",\"indices\":[21,29]}],\"urls\":[],\"user_mentions\":[],\"symbols\":[],\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"extended_entities\":{\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"favorited\":false,\"retweeted\":false,\"possibly_sensitive\":false,\"filter_level\":\"low\",\"lang\":\"ab\",\"timestamp_ms\":\"1470959870658\"}"));