From bad2c86d4aa61db8f3d45924bb0036be630f2ec4 Mon Sep 17 00:00:00 2001
From: Peter B <5107405+pmbaumgartner@users.noreply.github.com>
Date: Thu, 29 Oct 2020 16:28:43 -0400
Subject: [PATCH] first commit

---
 .github/workflows/CompatHelper.yml |  16 ++
 .github/workflows/TagBot.yml       |  13 +
 .gitignore                         |  24 ++
 .travis.yml                        |  28 ++
 LICENSE                            |  21 ++
 Project.toml                       |  24 ++
 README.md                          | 160 ++++++++++++
 src/PushshiftRedditDistiller.jl    |  18 ++
 src/deps.jl                        | 403 +++++++++++++++++++++++++++++
 src/distiller.jl                   |  41 +++
 src/filter.jl                      |  30 +++
 src/types.jl                       |  25 ++
 src/utils.jl                       |  36 +++
 test/runtests.jl                   |  19 ++
 14 files changed, 858 insertions(+)
 create mode 100644 .github/workflows/CompatHelper.yml
 create mode 100644 .github/workflows/TagBot.yml
 create mode 100644 .gitignore
 create mode 100644 .travis.yml
 create mode 100644 LICENSE
 create mode 100644 Project.toml
 create mode 100644 README.md
 create mode 100644 src/PushshiftRedditDistiller.jl
 create mode 100644 src/deps.jl
 create mode 100644 src/distiller.jl
 create mode 100644 src/filter.jl
 create mode 100644 src/types.jl
 create mode 100644 src/utils.jl
 create mode 100644 test/runtests.jl

diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
new file mode 100644
index 0000000..cba9134
--- /dev/null
+++ b/.github/workflows/CompatHelper.yml
@@ -0,0 +1,16 @@
+name: CompatHelper
+on:
+  schedule:
+    - cron: 0 0 * * *
+  workflow_dispatch:
+jobs:
+  CompatHelper:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
+        run: julia -e 'using CompatHelper; CompatHelper.main()'
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
new file mode 100644
index 0000000..e72d645
--- /dev/null
+++ b/.github/workflows/TagBot.yml
@@ -0,0 +1,13 @@
+name: TagBot
+on:
+  schedule:
+    - cron: 0 0 * * *
+  workflow_dispatch:
+jobs:
+  TagBot:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          ssh: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..29126e4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,24 @@
+# Files generated by invoking Julia with --code-coverage
+*.jl.cov
+*.jl.*.cov
+
+# Files generated by invoking Julia with --track-allocation
+*.jl.mem
+
+# System-specific files and directories generated by the BinaryProvider and BinDeps packages
+# They contain absolute paths specific to the host computer, and so should not be committed
+deps/deps.jl
+deps/build.log
+deps/downloads/
+deps/usr/
+deps/src/
+
+# Build artifacts for creating documentation generated by the Documenter package
+docs/build/
+docs/site/
+
+# File generated by Pkg, the package manager, based on a corresponding Project.toml
+# It records a fixed state of all packages used by the project. As such, it should not be
+# committed for packages, but should be committed for applications that require a static
+# environment.
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e4d4661
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,28 @@
+# Documentation: http://docs.travis-ci.com/user/languages/julia
+language: julia
+notifications:
+  email: false
+julia:
+  - 1.5  # oldest release that can load the package (it extends Base.contains, new in 1.5)
+  - 1    # latest stable 1.x release
+  - nightly
+os:
+  - linux
+  - osx
+  - windows
+arch:
+  - x64
+cache:
+  directories:
+    - ~/.julia/artifacts
+jobs:
+  fast_finish: true
+  allow_failures:
+    - julia: nightly
+after_success:
+  - |
+    julia -e '
+      using Pkg
+      Pkg.add("Coverage")
+      using Coverage
+      Codecov.submit(process_folder())'
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9cdcfba
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 RTI International and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Project.toml b/Project.toml
new file mode 100644
index 0000000..e354c01
--- /dev/null
+++ b/Project.toml
@@ -0,0 +1,24 @@
+name = "PushshiftRedditDistiller"
+uuid = "860182d5-2f7f-4b01-a084-d8f9ccad7d63"
+authors = ["Peter B <5107405+pmbaumgartner@users.noreply.github.com> and contributors"]
+version = "0.1.0"
+
+[deps]
+CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
+CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
+DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
+JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
+ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
+TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
+
+[compat]
+julia = "1.5"  # src/filter.jl extends Base.contains, which only exists from Julia 1.5 on
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d5bb821
--- /dev/null
+++ b/README.md
@@ -0,0 +1,160 @@
+# PushshiftRedditDistiller
+
+This package is intended to assist with downloading, extracting, and distilling the monthly reddit data dumps made available through [pushshift.io](https://files.pushshift.io/reddit/).
+
+# Example Use
+
+## Preexisting File
+If you already have a monthly submissions or comments file downloaded:
+
+```julia
+julia> using PushshiftRedditDistiller
+
+julia> filter = RedditDataFilter(author=["spez"])
+
+julia> spez_comments = distill("~/Downloads/RC_2005-12.bz2", filter)
+
+julia> length(spez_comments)
+7
+
+julia> typeof(spez_comments)
+Array{Dict{Symbol,Any},1}
+
+julia> first(spez_comments)
+Dict{Symbol,Any} with 18 entries:
+  :author_flair_css_class => nothing
+  :gilded                 => 0
+  :parent_id              => "t3_17942"
+  :score                  => 4
+  :link_id                => "t3_17942"
+  :created_utc            => 1134392748
+  :author_flair_text      => nothing
+  :distinguished          => nothing
+  :author                 => "spez"
+  :stickied               => false
+  :subreddit              => "reddit.com"
+  :subreddit_id           => "t5_6"
+  :id                     => "c53"
+  :retrieved_on           => 1473738414
+  :body                   => "still looks like a death trap to me..."
+  :controversiality       => 0
+  :ups                    => 4
+  :edited                 => false
+```
+
+## DataDeps Catalog
+
+All monthly comment and submissions files are cataloged and available using [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). The names used with the `datadep` string macro follow the format `reddit-comments-YYYY-MM` for comments and `reddit-submissions-YYYY-MM` for submissions.
+
+If the file isn't downloaded, you will be prompted to download that archive file before processing.
+
+```julia
+julia> using DataDeps
+
+julia> more_spez_comments = distill(datadep"reddit-comments-2006-04", filter)
+This program has requested access to the data dependency reddit-comments-2006-04.
+which is not currently installed. It can be installed automatically, and you will not see this message again.
+
+Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). 
+The pushshift reddit dataset.
+In Proceedings of the International AAAI Conference on Web and Social Media 
+(Vol. 14, pp. 830-839).
+
+
+
+Do you want to download the dataset from https://files.pushshift.io/reddit/comments/RC_2006-04.bz2 to "~/.julia/datadeps/reddit-comments-2006-04"?
+[y/n]
+y
+
+┌ Info: Downloading
+│   source = "https://files.pushshift.io/reddit/comments/RC_2006-04.bz2"
+│   dest = "~/.julia/datadeps/reddit-comments-2006-04/RC_2006-04.bz2"
+│   progress = 1.0
+│   time_taken = "0.51 s"
+│   time_remaining = "0.0 s"
+│   average_speed = "3.729 MiB/s"
+│   downloaded = "1.891 MiB"
+│   remaining = "0 bytes"
+└   total = "1.891 MiB"
+┌ Warning: Checksum not provided, add to the Datadep Registration the following hash line
+│   hash = "1e757b8a7dd4b1f7281329ac77cf4a20f59571d59899983fd7f347b24b081516"
+
+julia> length(more_spez_comments)
+19
+```
+
+## Multithreaded Support
+
+If you start Julia with more than one thread available, multithreading will be enabled. It's best to play around with the number of threads for a bit if you're looking to optimize parsing speed, as it depends on how complex your filter is and on the decompression algorithm the file uses. For example, slower decompression algorithms (`bz2`) will bottleneck the speed at which you can feed lines from the stream to each thread to parse. For faster algorithms, you may have a strict filter that doesn't result in many lines needing to be fully parsed, making the overhead of coordinating threads costly.
+
+The number of threads used while parsing will be displayed in the progress bar.
+
+```bash
+$ julia -t 3
+```
+...
+
+```julia
+julia> results = distill(datadep"reddit-comments-2006-04", filter)
+File: RC_2006-04.bz2; Threads: 3 ┣ ╱   ╱   ╱   ╱   ╱   ╱   ╱   ╱   ╱ ┫ 19090it 00:01 [35469.7 it/s]
+```
+
+
+## Filtering with `RedditDataFilter`
+
+Data can be filtered on the `author` or `subreddit` field currently. The filtering is currently disjunctive (OR), so if both `author` and `subreddit` are passed, it will return data from those author(s) OR those subreddit(s).
+
+In addition, you can control which fields are returned with the `fields` argument.
+
+All arguments are of type `Vector{String}`, though passing a single string to an argument will convert it to a length 1 `Vector`.
+
+**Note that no checking of correct field names is done for you, since the fields available change over time**
+
+```julia
+julia> using Dates
+
+julia> timestamps_only = RedditDataFilter(fields=["created_utc"])
+
+julia> timestamp_comments = distill(datadep"reddit-comments-2006-04", timestamps_only)
+
+julia> Dates.unix2datetime(first(timestamp_comments)[:created_utc])
+2006-04-01T00:00:55
+```
+
+## Other Usage Notes
+
+### Exporting
+
+`distill` returns `Vector{Dict{Symbol,Any}}`, which can be fed into a `DataFrame` from [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) (not included).
+
+```julia
+julia> using DataFrames, CSV
+
+julia> spez_comments_df = DataFrame(spez_comments)
+
+julia> CSV.write("spez_comments.csv", spez_comments_df, quotestrings=true)
+```
+
+### Managing DataDeps
+
+Some of the files are large - if you were to download the whole archive it would be over one TB. Because of this, you may want to remove a file after use or change your DataDeps download directory to another drive.
+
+**Removal**
+
+```julia
+julia> rm(datadep"reddit-comments-2006-04", recursive=true)
+```
+
+**New Directory**
+
+```julia
+julia> download_path = "/Users/user/pushshift-datadeps"
+
+julia> mkdir(download_path)
+
+julia> ENV["DATADEPS_LOAD_PATH"] = download_path
+
+julia> ENV["DATADEPS_NO_STANDARD_LOAD_PATH"] = true
+```
+
+
diff --git a/src/PushshiftRedditDistiller.jl b/src/PushshiftRedditDistiller.jl
new file mode 100644
index 0000000..5a661de
--- /dev/null
+++ b/src/PushshiftRedditDistiller.jl
@@ -0,0 +1,18 @@
+module PushshiftRedditDistiller
+# Download, extract, and filter the monthly pushshift.io reddit data dumps.
+using DataDeps
+
+export RedditDataFilter, distill
+
+include("deps.jl")
+include("types.jl")
+include("distiller.jl")
+include("filter.jl")
+include("utils.jl")
+
+# Runs every time the module is loaded: registers the DataDeps declared in deps.jl.
+function __init__()
+    init_deps()
+end
+
+end
diff --git a/src/deps.jl b/src/deps.jl
new file mode 100644
index 0000000..fb13e36
--- /dev/null
+++ b/src/deps.jl
@@ -0,0 +1,403 @@
+using DataDeps
+
+comments_metadata = [
+    (file = "RC_2005-12.bz2", name = "2005-12"),
+    (file = "RC_2006-01.bz2", name = "2006-01"),
+    (file = "RC_2006-02.bz2", name = "2006-02"),
+    (file = "RC_2006-03.bz2", name = "2006-03"),
+    (file = "RC_2006-04.bz2", name = "2006-04"),
+    (file = "RC_2006-05.bz2", name = "2006-05"),
+    (file = "RC_2006-06.bz2", name = "2006-06"),
+    (file = "RC_2006-07.bz2", name = "2006-07"),
+    (file = "RC_2006-08.bz2", name = "2006-08"),
+    (file = "RC_2006-09.bz2", name = "2006-09"),
+    (file = "RC_2006-10.bz2", name = "2006-10"),
+    (file = "RC_2006-11.bz2", name = "2006-11"),
+    (file = "RC_2006-12.bz2", name = "2006-12"),
+    (file = "RC_2007-01.bz2", name = "2007-01"),
+    (file = "RC_2007-02.bz2", name = "2007-02"),
+    (file = "RC_2007-03.bz2", name = "2007-03"),
+    (file = "RC_2007-04.bz2", name = "2007-04"),
+    (file = "RC_2007-05.bz2", name = "2007-05"),
+    (file = "RC_2007-06.bz2", name = "2007-06"),
+    (file = "RC_2007-07.bz2", name = "2007-07"),
+    (file = "RC_2007-08.bz2", name = "2007-08"),
+    (file = "RC_2007-09.bz2", name = "2007-09"),
+    (file = "RC_2007-10.bz2", name = "2007-10"),
+    (file = "RC_2007-11.bz2", name = "2007-11"),
+    (file = "RC_2007-12.bz2", name = "2007-12"),
+    (file = "RC_2008-01.bz2", name = "2008-01"),
+    (file = "RC_2008-02.bz2", name = "2008-02"),
+    (file = "RC_2008-03.bz2", name = "2008-03"),
+    (file = "RC_2008-04.bz2", name = "2008-04"),
+    (file = "RC_2008-05.bz2", name = "2008-05"),
+    (file = "RC_2008-06.bz2", name = "2008-06"),
+    (file = "RC_2008-07.bz2", name = "2008-07"),
+    (file = "RC_2008-08.bz2", name = "2008-08"),
+    (file = "RC_2008-09.bz2", name = "2008-09"),
+    (file = "RC_2008-10.bz2", name = "2008-10"),
+    (file = "RC_2008-11.bz2", name = "2008-11"),
+    (file = "RC_2008-12.bz2", name = "2008-12"),
+    (file = "RC_2009-01.bz2", name = "2009-01"),
+    (file = "RC_2009-02.bz2", name = "2009-02"),
+    (file = "RC_2009-03.bz2", name = "2009-03"),
+    (file = "RC_2009-04.bz2", name = "2009-04"),
+    (file = "RC_2009-05.bz2", name = "2009-05"),
+    (file = "RC_2009-06.bz2", name = "2009-06"),
+    (file = "RC_2009-07.bz2", name = "2009-07"),
+    (file = "RC_2009-08.bz2", name = "2009-08"),
+    (file = "RC_2009-09.bz2", name = "2009-09"),
+    (file = "RC_2009-10.bz2", name = "2009-10"),
+    (file = "RC_2009-11.bz2", name = "2009-11"),
+    (file = "RC_2009-12.bz2", name = "2009-12"),
+    (file = "RC_2010-01.bz2", name = "2010-01"),
+    (file = "RC_2010-02.bz2", name = "2010-02"),
+    (file = "RC_2010-03.bz2", name = "2010-03"),
+    (file = "RC_2010-04.bz2", name = "2010-04"),
+    (file = "RC_2010-05.bz2", name = "2010-05"),
+    (file = "RC_2010-06.bz2", name = "2010-06"),
+    (file = "RC_2010-07.bz2", name = "2010-07"),
+    (file = "RC_2010-08.bz2", name = "2010-08"),
+    (file = "RC_2010-09.bz2", name = "2010-09"),
+    (file = "RC_2010-10.bz2", name = "2010-10"),
+    (file = "RC_2010-11.bz2", name = "2010-11"),
+    (file = "RC_2010-12.bz2", name = "2010-12"),
+    (file = "RC_2011-01.bz2", name = "2011-01"),
+    (file = "RC_2011-02.bz2", name = "2011-02"),
+    (file = "RC_2011-03.bz2", name = "2011-03"),
+    (file = "RC_2011-04.bz2", name = "2011-04"),
+    (file = "RC_2011-05.bz2", name = "2011-05"),
+    (file = "RC_2011-06.bz2", name = "2011-06"),
+    (file = "RC_2011-07.bz2", name = "2011-07"),
+    (file = "RC_2011-08.bz2", name = "2011-08"),
+    (file = "RC_2011-09.bz2", name = "2011-09"),
+    (file = "RC_2011-10.bz2", name = "2011-10"),
+    (file = "RC_2011-11.bz2", name = "2011-11"),
+    (file = "RC_2011-12.bz2", name = "2011-12"),
+    (file = "RC_2012-01.bz2", name = "2012-01"),
+    (file = "RC_2012-02.bz2", name = "2012-02"),
+    (file = "RC_2012-03.bz2", name = "2012-03"),
+    (file = "RC_2012-04.bz2", name = "2012-04"),
+    (file = "RC_2012-05.bz2", name = "2012-05"),
+    (file = "RC_2012-06.bz2", name = "2012-06"),
+    (file = "RC_2012-07.bz2", name = "2012-07"),
+    (file = "RC_2012-08.bz2", name = "2012-08"),
+    (file = "RC_2012-09.bz2", name = "2012-09"),
+    (file = "RC_2012-10.bz2", name = "2012-10"),
+    (file = "RC_2012-11.bz2", name = "2012-11"),
+    (file = "RC_2012-12.bz2", name = "2012-12"),
+    (file = "RC_2013-01.bz2", name = "2013-01"),
+    (file = "RC_2013-02.bz2", name = "2013-02"),
+    (file = "RC_2013-03.bz2", name = "2013-03"),
+    (file = "RC_2013-04.bz2", name = "2013-04"),
+    (file = "RC_2013-05.bz2", name = "2013-05"),
+    (file = "RC_2013-06.bz2", name = "2013-06"),
+    (file = "RC_2013-07.bz2", name = "2013-07"),
+    (file = "RC_2013-08.bz2", name = "2013-08"),
+    (file = "RC_2013-09.bz2", name = "2013-09"),
+    (file = "RC_2013-10.bz2", name = "2013-10"),
+    (file = "RC_2013-11.bz2", name = "2013-11"),
+    (file = "RC_2013-12.bz2", name = "2013-12"),
+    (file = "RC_2014-01.bz2", name = "2014-01"),
+    (file = "RC_2014-02.bz2", name = "2014-02"),
+    (file = "RC_2014-03.bz2", name = "2014-03"),
+    (file = "RC_2014-04.bz2", name = "2014-04"),
+    (file = "RC_2014-05.bz2", name = "2014-05"),
+    (file = "RC_2014-06.bz2", name = "2014-06"),
+    (file = "RC_2014-07.bz2", name = "2014-07"),
+    (file = "RC_2014-08.bz2", name = "2014-08"),
+    (file = "RC_2014-09.bz2", name = "2014-09"),
+    (file = "RC_2014-10.bz2", name = "2014-10"),
+    (file = "RC_2014-11.bz2", name = "2014-11"),
+    (file = "RC_2014-12.bz2", name = "2014-12"),
+    (file = "RC_2015-01.bz2", name = "2015-01"),
+    (file = "RC_2015-02.bz2", name = "2015-02"),
+    (file = "RC_2015-03.bz2", name = "2015-03"),
+    (file = "RC_2015-04.bz2", name = "2015-04"),
+    (file = "RC_2015-05.bz2", name = "2015-05"),
+    (file = "RC_2015-06.bz2", name = "2015-06"),
+    (file = "RC_2015-07.bz2", name = "2015-07"),
+    (file = "RC_2015-08.bz2", name = "2015-08"),
+    (file = "RC_2015-09.bz2", name = "2015-09"),
+    (file = "RC_2015-10.bz2", name = "2015-10"),
+    (file = "RC_2015-11.bz2", name = "2015-11"),
+    (file = "RC_2015-12.bz2", name = "2015-12"),
+    (file = "RC_2016-01.bz2", name = "2016-01"),
+    (file = "RC_2016-02.bz2", name = "2016-02"),
+    (file = "RC_2016-03.bz2", name = "2016-03"),
+    (file = "RC_2016-04.bz2", name = "2016-04"),
+    (file = "RC_2016-05.bz2", name = "2016-05"),
+    (file = "RC_2016-06.bz2", name = "2016-06"),
+    (file = "RC_2016-07.bz2", name = "2016-07"),
+    (file = "RC_2016-08.bz2", name = "2016-08"),
+    (file = "RC_2016-09.bz2", name = "2016-09"),
+    (file = "RC_2016-10.bz2", name = "2016-10"),
+    (file = "RC_2016-11.bz2", name = "2016-11"),
+    (file = "RC_2016-12.bz2", name = "2016-12"),
+    (file = "RC_2017-01.bz2", name = "2017-01"),
+    (file = "RC_2017-02.bz2", name = "2017-02"),
+    (file = "RC_2017-03.bz2", name = "2017-03"),
+    (file = "RC_2017-04.bz2", name = "2017-04"),
+    (file = "RC_2017-05.bz2", name = "2017-05"),
+    (file = "RC_2017-06.bz2", name = "2017-06"),
+    (file = "RC_2017-07.bz2", name = "2017-07"),
+    (file = "RC_2017-08.bz2", name = "2017-08"),
+    (file = "RC_2017-09.bz2", name = "2017-09"),
+    (file = "RC_2017-10.bz2", name = "2017-10"),
+    (file = "RC_2017-11.bz2", name = "2017-11"),
+    (file = "RC_2017-12.xz", name = "2017-12"),
+    (file = "RC_2018-01.xz", name = "2018-01"),
+    (file = "RC_2018-02.xz", name = "2018-02"),
+    (file = "RC_2018-03.xz", name = "2018-03"),
+    (file = "RC_2018-04.xz", name = "2018-04"),
+    (file = "RC_2018-05.xz", name = "2018-05"),
+    (file = "RC_2018-06.xz", name = "2018-06"),
+    (file = "RC_2018-07.xz", name = "2018-07"),
+    (file = "RC_2018-08.xz", name = "2018-08"),
+    (file = "RC_2018-09.xz", name = "2018-09"),
+    (file = "RC_2018-10.zst", name = "2018-10"),
+    (file = "RC_2018-11.zst", name = "2018-11"),
+    (file = "RC_2018-12.zst", name = "2018-12"),
+    (file = "RC_2019-01.zst", name = "2019-01"),
+    (file = "RC_2019-02.zst", name = "2019-02"),
+    (file = "RC_2019-03.zst", name = "2019-03"),
+    (file = "RC_2019-04.zst", name = "2019-04"),
+    (file = "RC_2019-05.zst", name = "2019-05"),
+    (file = "RC_2019-06.zst", name = "2019-06"),
+    (file = "RC_2019-07.zst", name = "2019-07"),
+    (file = "RC_2019-08.zst", name = "2019-08"),
+    (file = "RC_2019-09.zst", name = "2019-09"),
+    (file = "RC_2019-10.zst", name = "2019-10"),
+    (file = "RC_2019-11.zst", name = "2019-11"),
+    (file = "RC_2019-12.zst", name = "2019-12"),
+]
+
+submissions_metadata = [
+    (file = "RS_v2_2005-06.xz", name = "2005-06"),
+    (file = "RS_v2_2005-07.xz", name = "2005-07"),
+    (file = "RS_v2_2005-08.xz", name = "2005-08"),
+    (file = "RS_v2_2005-09.xz", name = "2005-09"),
+    (file = "RS_v2_2005-10.xz", name = "2005-10"),
+    (file = "RS_v2_2005-11.xz", name = "2005-11"),
+    (file = "RS_v2_2005-12.xz", name = "2005-12"),
+    (file = "RS_v2_2006-01.xz", name = "2006-01"),
+    (file = "RS_v2_2006-02.xz", name = "2006-02"),
+    (file = "RS_v2_2006-03.xz", name = "2006-03"),
+    (file = "RS_v2_2006-04.xz", name = "2006-04"),
+    (file = "RS_v2_2006-05.xz", name = "2006-05"),
+    (file = "RS_v2_2006-06.xz", name = "2006-06"),
+    (file = "RS_v2_2006-07.xz", name = "2006-07"),
+    (file = "RS_v2_2006-08.xz", name = "2006-08"),
+    (file = "RS_v2_2006-09.xz", name = "2006-09"),
+    (file = "RS_v2_2006-10.xz", name = "2006-10"),
+    (file = "RS_v2_2006-11.xz", name = "2006-11"),
+    (file = "RS_v2_2006-12.xz", name = "2006-12"),
+    (file = "RS_v2_2007-01.xz", name = "2007-01"),
+    (file = "RS_v2_2007-02.xz", name = "2007-02"),
+    (file = "RS_v2_2007-03.xz", name = "2007-03"),
+    (file = "RS_v2_2007-04.xz", name = "2007-04"),
+    (file = "RS_v2_2007-05.xz", name = "2007-05"),
+    (file = "RS_v2_2007-06.xz", name = "2007-06"),
+    (file = "RS_v2_2007-07.xz", name = "2007-07"),
+    (file = "RS_v2_2007-08.xz", name = "2007-08"),
+    (file = "RS_v2_2007-09.xz", name = "2007-09"),
+    (file = "RS_v2_2007-10.xz", name = "2007-10"),
+    (file = "RS_v2_2007-11.xz", name = "2007-11"),
+    (file = "RS_v2_2007-12.xz", name = "2007-12"),
+    (file = "RS_v2_2008-02.xz", name = "2008-02"),
+    (file = "RS_v2_2008-03.xz", name = "2008-03"),
+    (file = "RS_v2_2008-04.xz", name = "2008-04"),
+    (file = "RS_v2_2008-05.xz", name = "2008-05"),
+    (file = "RS_v2_2008-07.xz", name = "2008-07"),
+    (file = "RS_v2_2008-08.xz", name = "2008-08"),
+    (file = "RS_v2_2008-09.xz", name = "2008-09"),
+    (file = "RS_v2_2008-10.xz", name = "2008-10"),
+    (file = "RS_v2_2008-11.xz", name = "2008-11"),
+    (file = "RS_v2_2008-12.xz", name = "2008-12"),
+    (file = "RS_v2_2009-01.xz", name = "2009-01"),
+    (file = "RS_v2_2009-02.xz", name = "2009-02"),
+    (file = "RS_v2_2009-03.xz", name = "2009-03"),
+    (file = "RS_v2_2009-04.xz", name = "2009-04"),
+    (file = "RS_v2_2009-05.xz", name = "2009-05"),
+    (file = "RS_v2_2009-06.xz", name = "2009-06"),
+    (file = "RS_v2_2009-07.xz", name = "2009-07"),
+    (file = "RS_v2_2009-08.xz", name = "2009-08"),
+    (file = "RS_v2_2009-09.xz", name = "2009-09"),
+    (file = "RS_v2_2009-10.xz", name = "2009-10"),
+    (file = "RS_v2_2009-11.xz", name = "2009-11"),
+    (file = "RS_v2_2009-12.xz", name = "2009-12"),
+    (file = "RS_v2_2010-01.xz", name = "2010-01"),
+    (file = "RS_v2_2010-02.xz", name = "2010-02"),
+    (file = "RS_v2_2010-03.xz", name = "2010-03"),
+    (file = "RS_v2_2010-04.xz", name = "2010-04"),
+    (file = "RS_v2_2010-05.xz", name = "2010-05"),
+    (file = "RS_v2_2010-06.xz", name = "2010-06"),
+    (file = "RS_v2_2010-07.xz", name = "2010-07"),
+    (file = "RS_v2_2010-08.xz", name = "2010-08"),
+    (file = "RS_v2_2010-09.xz", name = "2010-09"),
+    (file = "RS_v2_2010-10.xz", name = "2010-10"),
+    (file = "RS_v2_2010-11.xz", name = "2010-11"),
+    (file = "RS_v2_2010-12.xz", name = "2010-12"),
+    (file = "RS_2011-01.bz2", name = "2011-01"),
+    (file = "RS_2011-02.bz2", name = "2011-02"),
+    (file = "RS_2011-03.bz2", name = "2011-03"),
+    (file = "RS_2011-04.bz2", name = "2011-04"),
+    (file = "RS_2011-05.bz2", name = "2011-05"),
+    (file = "RS_2011-06.bz2", name = "2011-06"),
+    (file = "RS_2011-07.bz2", name = "2011-07"),
+    (file = "RS_2011-08.bz2", name = "2011-08"),
+    (file = "RS_2011-09.bz2", name = "2011-09"),
+    (file = "RS_2011-10.bz2", name = "2011-10"),
+    (file = "RS_2011-11.bz2", name = "2011-11"),
+    (file = "RS_2011-12.bz2", name = "2011-12"),
+    (file = "RS_2012-01.bz2", name = "2012-01"),
+    (file = "RS_2012-02.bz2", name = "2012-02"),
+    (file = "RS_2012-03.bz2", name = "2012-03"),
+    (file = "RS_2012-04.bz2", name = "2012-04"),
+    (file = "RS_2012-05.bz2", name = "2012-05"),
+    (file = "RS_2012-06.bz2", name = "2012-06"),
+    (file = "RS_2012-07.bz2", name = "2012-07"),
+    (file = "RS_2012-08.bz2", name = "2012-08"),
+    (file = "RS_2012-09.bz2", name = "2012-09"),
+    (file = "RS_2012-10.bz2", name = "2012-10"),
+    (file = "RS_2012-11.bz2", name = "2012-11"),
+    (file = "RS_2012-12.bz2", name = "2012-12"),
+    (file = "RS_2013-01.bz2", name = "2013-01"),
+    (file = "RS_2013-02.bz2", name = "2013-02"),
+    (file = "RS_2013-03.bz2", name = "2013-03"),
+    (file = "RS_2013-04.bz2", name = "2013-04"),
+    (file = "RS_2013-05.bz2", name = "2013-05"),
+    (file = "RS_2013-06.bz2", name = "2013-06"),
+    (file = "RS_2013-07.bz2", name = "2013-07"),
+    (file = "RS_2013-08.bz2", name = "2013-08"),
+    (file = "RS_2013-09.bz2", name = "2013-09"),
+    (file = "RS_2013-10.bz2", name = "2013-10"),
+    (file = "RS_2013-11.bz2", name = "2013-11"),
+    (file = "RS_2013-12.bz2", name = "2013-12"),
+    (file = "RS_2014-01.bz2", name = "2014-01"),
+    (file = "RS_2014-02.bz2", name = "2014-02"),
+    (file = "RS_2014-03.bz2", name = "2014-03"),
+    (file = "RS_2014-04.bz2", name = "2014-04"),
+    (file = "RS_2014-05.bz2", name = "2014-05"),
+    (file = "RS_2014-06.bz2", name = "2014-06"),
+    (file = "RS_2014-07.bz2", name = "2014-07"),
+    (file = "RS_2014-08.bz2", name = "2014-08"),
+    (file = "RS_2014-09.bz2", name = "2014-09"),
+    (file = "RS_2014-10.bz2", name = "2014-10"),
+    (file = "RS_2014-11.bz2", name = "2014-11"),
+    (file = "RS_2014-12.bz2", name = "2014-12"),
+    (file = "RS_2015-01.zst", name = "2015-01"),
+    (file = "RS_2015-02.zst", name = "2015-02"),
+    (file = "RS_2015-03.zst", name = "2015-03"),
+    (file = "RS_2015-04.zst", name = "2015-04"),
+    (file = "RS_2015-05.zst", name = "2015-05"),
+    (file = "RS_2015-06.zst", name = "2015-06"),
+    (file = "RS_2015-07.zst", name = "2015-07"),
+    (file = "RS_2015-08.zst", name = "2015-08"),
+    (file = "RS_2015-09.zst", name = "2015-09"),
+    (file = "RS_2015-10.zst", name = "2015-10"),
+    (file = "RS_2015-11.zst", name = "2015-11"),
+    (file = "RS_2015-12.zst", name = "2015-12"),
+    (file = "RS_2016-01.zst", name = "2016-01"),
+    (file = "RS_2016-02.zst", name = "2016-02"),
+    (file = "RS_2016-03.zst", name = "2016-03"),
+    (file = "RS_2016-04.zst", name = "2016-04"),
+    (file = "RS_2016-05.zst", name = "2016-05"),
+    (file = "RS_2016-06.zst", name = "2016-06"),
+    (file = "RS_2016-07.zst", name = "2016-07"),
+    (file = "RS_2016-08.zst", name = "2016-08"),
+    (file = "RS_2016-09.zst", name = "2016-09"),
+    (file = "RS_2016-10.zst", name = "2016-10"),
+    (file = "RS_2016-11.zst", name = "2016-11"),
+    (file = "RS_2016-12.zst", name = "2016-12"),
+    (file = "RS_2017-01.bz2", name = "2017-01"),
+    (file = "RS_2017-02.bz2", name = "2017-02"),
+    (file = "RS_2017-03.bz2", name = "2017-03"),
+    (file = "RS_2017-04.bz2", name = "2017-04"),
+    (file = "RS_2017-05.bz2", name = "2017-05"),
+    (file = "RS_2017-06.bz2", name = "2017-06"),
+    (file = "RS_2017-07.bz2", name = "2017-07"),
+    (file = "RS_2017-08.bz2", name = "2017-08"),
+    (file = "RS_2017-09.bz2", name = "2017-09"),
+    (file = "RS_2017-10.bz2", name = "2017-10"),
+    (file = "RS_2017-11.xz", name = "2017-11"),
+    (file = "RS_2017-12.xz", name = "2017-12"),
+    (file = "RS_2018-01.xz", name = "2018-01"),
+    (file = "RS_2018-02.xz", name = "2018-02"),
+    (file = "RS_2018-03.xz", name = "2018-03"),
+    (file = "RS_2018-04.xz", name = "2018-04"),
+    (file = "RS_2018-05.xz", name = "2018-05"),
+    (file = "RS_2018-06.xz", name = "2018-06"),
+    (file = "RS_2018-07.xz", name = "2018-07"),
+    (file = "RS_2018-08.xz", name = "2018-08"),
+    (file = "RS_2018-09.xz", name = "2018-09"),
+    (file = "RS_2018-10.xz", name = "2018-10"),
+    (file = "RS_2018-11.zst", name = "2018-11"),
+    (file = "RS_2018-12.zst", name = "2018-12"),
+    (file = "RS_2019-01.zst", name = "2019-01"),
+    (file = "RS_2019-02.zst", name = "2019-02"),
+    (file = "RS_2019-03.zst", name = "2019-03"),
+    (file = "RS_2019-04.zst", name = "2019-04"),
+    (file = "RS_2019-05.zst", name = "2019-05"),
+    (file = "RS_2019-06.zst", name = "2019-06"),
+    (file = "RS_2019-07.zst", name = "2019-07"),
+    (file = "RS_2019-08.zst", name = "2019-08"),
+    (file = "RS_2019-09.zst", name = "2019-09"),
+    (file = "RS_2019-10.zst", name = "2019-10"),
+    (file = "RS_2019-11.zst", name = "2019-11"),
+    (file = "RS_2019-12.zst", name = "2019-12"),
+    (file = "RS_2020-01.zst", name = "2020-01"),
+    (file = "RS_2020-02.zst", name = "2020-02"),
+    (file = "RS_2020-03.zst", name = "2020-03"),
+    (file = "RS_2020-04.zst", name = "2020-04"),
+]
+
+
+# Citation DataDeps displays before it downloads any of the pushshift files.
+const PUSHSHIFT_CITATION = """
+Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). 
+The pushshift reddit dataset.
+In Proceedings of the International AAAI Conference on Web and Social Media 
+(Vol. 14, pp. 830-839).
+"""
+
+"""
+    init_deps()
+
+Register every pushshift reddit archive with DataDeps.
+
+One `DataDep` is created per entry of `comments_metadata` (named
+`reddit-comments-<name>`) and of `submissions_metadata` (named
+`reddit-submissions-<name>`), followed by the two sample files
+`reddit-comments-sample` and `reddit-submissions-sample`.
+"""
+function init_deps()
+    for comment in comments_metadata
+        register(DataDep(
+            "reddit-comments-$(comment.name)",
+            PUSHSHIFT_CITATION,
+            "https://files.pushshift.io/reddit/comments/$(comment.file)",
+        ))
+    end
+    for submission in submissions_metadata
+        register(DataDep(
+            "reddit-submissions-$(submission.name)",
+            PUSHSHIFT_CITATION,
+            "https://files.pushshift.io/reddit/submissions/$(submission.file)",
+        ))
+    end
+
+    # The sample files do not follow the monthly naming scheme, so they are
+    # registered individually.
+    register(DataDep(
+        "reddit-comments-sample",
+        PUSHSHIFT_CITATION,
+        "https://files.pushshift.io/reddit/comments/sample_data.json",
+    ))
+
+    register(DataDep(
+        "reddit-submissions-sample",
+        PUSHSHIFT_CITATION,
+        "https://files.pushshift.io/reddit/submissions/sample.json",
+    ))
+end
diff --git a/src/distiller.jl b/src/distiller.jl
new file mode 100644
index 0000000..6ff48ea
--- /dev/null
+++ b/src/distiller.jl
@@ -0,0 +1,41 @@
+using JSON3
+using ProgressBars
+# using DataFrames
+# using CSV
+
+"""
+    distill(path::String, datafilter::RedditDataFilter)
+
+Stream the file at `path` (decompressed according to its extension) line by
+line and return a `Vector{Dict{Symbol,Any}}` holding every record accepted by
+`datafilter`, reduced to the requested fields. If `path` is a directory,
+`getfile` picks the file to read. With more than one thread, lines are
+processed in spawned tasks and the order of the result is unspecified.
+"""
+function distill(path::String, datafilter::RedditDataFilter)
+    file = getfile(path)
+    records = Dict{Symbol,Any}[]
+    records_lock = ReentrantLock()
+    multithreaded = Threads.nthreads() > 1
+    # Cheap substring check first, so most non-matching lines are never parsed.
+    function process(line)
+        contains(line, datafilter) || return nothing
+        linedata = JSON3.read(line)
+        contains(linedata, datafilter) || return nothing
+        record = filter(datafilter, convert(Dict, linedata))
+        # Tasks may run concurrently (and migrate between threads), so guard
+        # the shared vector with a lock rather than indexing by `threadid()`.
+        lock(() -> push!(records, record), records_lock)
+        return nothing
+    end
+    open(get_decompressor_stream(file), file) do stream
+        bar = ProgressBar(eachline(stream))
+        bar.description = "File: $(basename(file)); Threads: $(Threads.nthreads())"
+        # `@sync` waits for every spawned task (rethrowing task failures), so no
+        # record is lost when the stream is closed and `records` is returned.
+        @sync for line in bar
+            multithreaded ? (Threads.@spawn process(line)) : process(line)
+        end
+    end
+    return records
+end
diff --git a/src/filter.jl b/src/filter.jl
new file mode 100644
index 0000000..879821e
--- /dev/null
+++ b/src/filter.jl
@@ -0,0 +1,30 @@
+linevalue(key::String, value::String) = string('"', key, "\":\"", value, '"')  # "key":"value"
+
+# Cheap pre-parse check: does raw JSON `line` contain a wanted author or subreddit
+# as a compact `"key":"value"` pair? A filter naming neither accepts every line.
+function Base.contains(line::String, datafilter::RedditDataFilter)
+    # Both lists must be empty to skip filtering; testing only `author` would
+    # let a subreddit-only filter accept every line.
+    isempty(datafilter.author) && isempty(datafilter.subreddit) && return true
+    return any(a -> contains(line, linevalue("author", a)), datafilter.author) ||
+        any(s -> contains(line, linevalue("subreddit", s)), datafilter.subreddit)
+end
+
+# Exact check on the parsed record: accept it when its `author` or `subreddit`
+# is listed in `datafilter`; a filter naming neither accepts every record.
+function Base.contains(json::JSON3.Object, datafilter::RedditDataFilter)
+    # Both lists must be empty to skip filtering; testing only `author` would
+    # let a subreddit-only filter accept every record.
+    isempty(datafilter.author) && isempty(datafilter.subreddit) && return true
+    return get(json, :author, nothing) ∈ datafilter.author ||
+        get(json, :subreddit, nothing) ∈ datafilter.subreddit
+end
+
+# Keep only the entries of `data` whose key is named in `datafilter.fields`;
+# with no fields requested, `data` itself is returned (not a copy).
+function Base.filter(datafilter::RedditDataFilter, data::Dict{Symbol,Any})
+    isempty(datafilter.fields) && return data
+    # Convert the field names once instead of once per dictionary entry.
+    wanted = Symbol.(datafilter.fields)
+    return filter(entry -> entry.first ∈ wanted, data)
+end
diff --git a/src/types.jl b/src/types.jl
new file mode 100644
index 0000000..136f94c
--- /dev/null
+++ b/src/types.jl
@@ -0,0 +1,25 @@
+using Parameters
+
+@with_kw struct RedditDataFilter  # selection criteria read by `contains` and `filter`
+    fields::Vector{String} = String[]  # record keys to keep; empty keeps every key
+    author::Vector{String} = String[]  # author names a record may match
+    subreddit::Vector{String} = String[]  # subreddit names a record may match
+end
+
+
+# Positional convenience constructors: any argument may be a single `String`,
+# which is wrapped in a one-element vector before the struct is built.
+RedditDataFilter(fields::String, author::Vector{String}, subreddit::Vector{String}) =
+    RedditDataFilter([fields], author, subreddit)
+RedditDataFilter(fields::Vector{String}, author::String, subreddit::Vector{String}) =
+    RedditDataFilter(fields, [author], subreddit)
+RedditDataFilter(fields::Vector{String}, author::Vector{String}, subreddit::String) =
+    RedditDataFilter(fields, author, [subreddit])
+RedditDataFilter(fields::String, author::String, subreddit::Vector{String}) =
+    RedditDataFilter([fields], [author], subreddit)
+RedditDataFilter(fields::String, author::Vector{String}, subreddit::String) =
+    RedditDataFilter([fields], author, [subreddit])
+RedditDataFilter(fields::Vector{String}, author::String, subreddit::String) =
+    RedditDataFilter(fields, [author], [subreddit])
+RedditDataFilter(fields::String, author::String, subreddit::String) =
+    RedditDataFilter([fields], [author], [subreddit])
diff --git a/src/utils.jl b/src/utils.jl
new file mode 100644
index 0000000..764279c
--- /dev/null
+++ b/src/utils.jl
@@ -0,0 +1,36 @@
+using CodecBzip2
+using CodecZstd
+using CodecXz
+using CodecZlib
+using TranscodingStreams
+
+const decompression_codecs = Dict(  # file extension => decompressing stream type
+    "bz2" => Bzip2DecompressorStream,
+    "zst" => ZstdDecompressorStream,
+    "xz" => XzDecompressorStream,
+    "gz" => GzipDecompressorStream,
+)
+
+const compression_codecs = Dict(  # file extension => compressing stream type
+    "bz2" => Bzip2CompressorStream,
+    "zst" => ZstdCompressorStream,
+    "xz" => XzCompressorStream,
+    "gz" => GzipCompressorStream,
+)
+
+get_decompressor_stream(filename::String) =  # unknown extension => `NoopStream`
+    get(decompression_codecs, last(split(filename, '.')), NoopStream)
+get_compressor_stream(filename::String) =  # unknown extension => `NoopStream`
+    get(compression_codecs, last(split(filename, '.')), NoopStream)
+
+# Return `path` itself, or its first `readdir` entry when `path` is a directory.
+function getfile(path::String)
+    isdir(path) || return path
+    files = readdir(path, join = true)
+    # Report an empty directory clearly instead of a `BoundsError` from `first`.
+    isempty(files) && throw(ArgumentError("No files in directory ($path)."))
+    if length(files) > 1
+        @warn "More than one file in directory ($path). Using first file."
+    end
+    return first(files)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..23a2a32
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,19 @@
+using PushshiftRedditDistiller
+using Test
+using DataDeps
+
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"  # presumably skips the DataDeps download prompt
+ENV["CI"] = "true"
+
+@testset "PushshiftRedditDistiller.jl" begin
+    unrestricted = RedditDataFilter()
+    @test isempty(unrestricted.author)
+    @test isempty(unrestricted.subreddit)
+    @test isempty(unrestricted.fields)
+    # The submissions sample is expected to yield 1000 records unfiltered.
+    submissions = distill(datadep"reddit-submissions-sample", unrestricted)
+    @test length(submissions) == 1000
+    # The comments sample is expected to yield 10000 records unfiltered.
+    comments = distill(datadep"reddit-comments-sample", unrestricted)
+    @test length(comments) == 10000
+end
\ No newline at end of file