From bad2c86d4aa61db8f3d45924bb0036be630f2ec4 Mon Sep 17 00:00:00 2001 From: Peter B <5107405+pmbaumgartner@users.noreply.github.com> Date: Thu, 29 Oct 2020 16:28:43 -0400 Subject: [PATCH] first commit --- .github/workflows/CompatHelper.yml | 16 ++ .github/workflows/TagBot.yml | 13 + .gitignore | 24 ++ .travis.yml | 28 ++ LICENSE | 21 ++ Project.toml | 24 ++ README.md | 160 ++++++++++++ src/PushshiftRedditDistiller.jl | 18 ++ src/deps.jl | 403 +++++++++++++++++++++++++++++ src/distiller.jl | 41 +++ src/filter.jl | 30 +++ src/types.jl | 25 ++ src/utils.jl | 36 +++ test/runtests.jl | 19 ++ 14 files changed, 858 insertions(+) create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/TagBot.yml create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 Project.toml create mode 100644 README.md create mode 100644 src/PushshiftRedditDistiller.jl create mode 100644 src/deps.jl create mode 100644 src/distiller.jl create mode 100644 src/filter.jl create mode 100644 src/types.jl create mode 100644 src/utils.jl create mode 100644 test/runtests.jl diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..cba9134 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 * * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..e72d645 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,13 @@ +name: TagBot +on: + schedule: + - cron: 0 0 * * * + 
workflow_dispatch: +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..29126e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. 
+Manifest.toml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e4d4661 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +# Documentation: http://docs.travis-ci.com/user/languages/julia +language: julia +notifications: + email: false +julia: + - 1.0 + - 1.5 + - nightly +os: + - linux + - osx + - windows +arch: + - x64 +cache: + directories: + - ~/.julia/artifacts +jobs: + fast_finish: true + allow_failures: + - julia: nightly +after_success: + - | + julia -e ' + using Pkg + Pkg.add("Coverage") + using Coverage + Codecov.submit(process_folder())' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9cdcfba --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 RTI International and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..e354c01 --- /dev/null +++ b/Project.toml @@ -0,0 +1,24 @@ +name = "PushshiftRedditDistiller" +uuid = "860182d5-2f7f-4b01-a084-d8f9ccad7d63" +authors = ["Peter B <5107405+pmbaumgartner@users.noreply.github.com> and contributors"] +version = "0.1.0" + +[deps] +CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" +CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" +TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" + +[compat] +julia = "1" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..d5bb821 --- /dev/null +++ b/README.md @@ -0,0 +1,160 @@ +# PushshiftRedditDistiller + +This package is intended to assist with downloading, extracting, and distilling the monthly reddit data dumps made available through [pushshift.io](https://files.pushshift.io/reddit/). 
+ +# Example Use + +## Preexisting File +If you already have a monthly submissions or comments file downloaded: + +```julia +julia> using PushshiftRedditDistiller + +julia> filter = RedditDataFilter(author=["spez"]) + +julia> spez_comments = distill(expanduser("~/Downloads/RC_2005-12.bz2"), filter) + +julia> length(spez_comments) +7 + +julia> typeof(spez_comments) +Array{Dict{Symbol,Any},1} + +julia> first(spez_comments) +Dict{Symbol,Any} with 18 entries: + :author_flair_css_class => nothing + :gilded => 0 + :parent_id => "t3_17942" + :score => 4 + :link_id => "t3_17942" + :created_utc => 1134392748 + :author_flair_text => nothing + :distinguished => nothing + :author => "spez" + :stickied => false + :subreddit => "reddit.com" + :subreddit_id => "t5_6" + :id => "c53" + :retrieved_on => 1473738414 + :body => "still looks like a death trap to me..." + :controversiality => 0 + :ups => 4 + :edited => false +``` + +## DataDeps Catalog + +All monthly comment and submissions files are cataloged and available using [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). The format for the `datadep` string macros is `reddit-comments-YYYY-MM` for comments and `reddit-submissions-YYYY-MM` for submissions. + +If the file isn't downloaded, you will be prompted to download that archive file before processing. + +```julia +julia> using DataDeps + +julia> more_spez_comments = distill(datadep"reddit-comments-2006-04", filter) +This program has requested access to the data dependency reddit-comments-2006-04. +which is not currently installed. It can be installed automatically, and you will not see this message again. + +Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). +The pushshift reddit dataset. +In Proceedings of the International AAAI Conference on Web and Social Media +(Vol. 14, pp. 830-839). + + + +Do you want to download the dataset from https://files.pushshift.io/reddit/comments/RC_2006-04.bz2 to "~/.julia/datadeps/reddit-comments-2006-04"? 
+[y/n] +y + +┌ Info: Downloading +│ source = "https://files.pushshift.io/reddit/comments/RC_2006-04.bz2" +│ dest = "~/.julia/datadeps/reddit-comments-2006-04/RC_2006-04.bz2" +│ progress = 1.0 +│ time_taken = "0.51 s" +│ time_remaining = "0.0 s" +│ average_speed = "3.729 MiB/s" +│ downloaded = "1.891 MiB" +│ remaining = "0 bytes" +└ total = "1.891 MiB" +┌ Warning: Checksum not provided, add to the Datadep Registration the following hash line +│ hash = "1e757b8a7dd4b1f7281329ac77cf4a20f59571d59899983fd7f347b24b081516" + +julia> length(more_spez_comments) +19 +``` + +## Multithreaded Support + +If you start Julia with more than one thread available, multithreading will be enabled. It's best to play around with the number of threads for a bit if you're looking to optimize parsing speed, as it depends on how complex your filter is and the decompression algorithm available. For example, slower decompression algorithms (`bz2`) will bottleneck the speed at which you can feed lines from the stream to each thread to parse. For faster algorithms, you may have a strict filter that doesn't result in many lines needing to be fully parsed, making the overhead of coordinating threads costly. + +The number of threads used while parsing will be displayed in the progress bar. + +```bash +$ julia -t 3 +``` +... + +```julia +julia> results = distill(datadep"reddit-comments-2006-04", filter) +File: RC_2006-04.bz2; Threads: 3 ┣ ╱ ╱ ╱ ╱ ╱ ╱ ╱ ╱ ╱ ┫ 19090it 00:01 [35469.7 it/s] +``` + + +## Filtering with `RedditDataFilter` + +Data can be filtered on the `author` or `subreddit` field currently. The filtering is currently disjunctive (OR), so if both `author` and `subreddit` are passed, it will return data from those author(s) OR those subreddit(s). + +In addition, you can control which fields are returned with the `fields` argument. + +All arguments are of type `Vector{String}`, though passing a single string to an argument will convert it to a length 1 `Vector`. 
+ +**Note that field names are not checked for you, since the available fields change over time.** + +```julia +julia> using Dates + +julia> timestamps_only = RedditDataFilter(fields=["created_utc"]) + +julia> timestamp_comments = distill(datadep"reddit-comments-2006-04", timestamps_only) + +julia> Dates.unix2datetime(first(timestamp_comments)[:created_utc]) +2006-04-01T00:00:55 +``` + +## Other Usage Notes + +### Exporting + +`distill` returns `Vector{Dict{Symbol,Any}}`, which can be fed into a `DataFrame` from [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) (not included). + +```julia +julia> using DataFrames, CSV + +julia> spez_comments_df = DataFrame(spez_comments) + +julia> CSV.write("spez_comments.csv", spez_comments_df, quotestrings=true) +``` + +### Managing DataDeps + +Some of the files are large - if you were to download the whole archive it would be over one TB. Because of this, you may want to remove a file after use or change your DataDeps download directory to another drive. 
+ +**Removal** + +```julia +julia> rm(datadep"reddit-comments-2006-04", recursive=true) +``` + +**New Directory** + +```julia +julia> download_path = "/Users/user/pushshift-datadeps" + +julia> mkdir(download_path) + +julia> ENV["DATADEPS_LOAD_PATH"] = download_path + +julia> ENV["DATADEPS_NO_STANDARD_LOAD_PATH"] = true +``` + + diff --git a/src/PushshiftRedditDistiller.jl b/src/PushshiftRedditDistiller.jl new file mode 100644 index 0000000..5a661de --- /dev/null +++ b/src/PushshiftRedditDistiller.jl @@ -0,0 +1,18 @@ +module PushshiftRedditDistiller + +using DataDeps + +export RedditDataFilter, distill + +include("deps.jl") +include("types.jl") +include("distiller.jl") +include("filter.jl") +include("utils.jl") + +function __init__() + init_deps() +end + + +end diff --git a/src/deps.jl b/src/deps.jl new file mode 100644 index 0000000..fb13e36 --- /dev/null +++ b/src/deps.jl @@ -0,0 +1,403 @@ +using DataDeps + +comments_metadata = [ + (file = "RC_2005-12.bz2", name = "2005-12"), + (file = "RC_2006-01.bz2", name = "2006-01"), + (file = "RC_2006-02.bz2", name = "2006-02"), + (file = "RC_2006-03.bz2", name = "2006-03"), + (file = "RC_2006-04.bz2", name = "2006-04"), + (file = "RC_2006-05.bz2", name = "2006-05"), + (file = "RC_2006-06.bz2", name = "2006-06"), + (file = "RC_2006-07.bz2", name = "2006-07"), + (file = "RC_2006-08.bz2", name = "2006-08"), + (file = "RC_2006-09.bz2", name = "2006-09"), + (file = "RC_2006-10.bz2", name = "2006-10"), + (file = "RC_2006-11.bz2", name = "2006-11"), + (file = "RC_2006-12.bz2", name = "2006-12"), + (file = "RC_2007-01.bz2", name = "2007-01"), + (file = "RC_2007-02.bz2", name = "2007-02"), + (file = "RC_2007-03.bz2", name = "2007-03"), + (file = "RC_2007-04.bz2", name = "2007-04"), + (file = "RC_2007-05.bz2", name = "2007-05"), + (file = "RC_2007-06.bz2", name = "2007-06"), + (file = "RC_2007-07.bz2", name = "2007-07"), + (file = "RC_2007-08.bz2", name = "2007-08"), + (file = "RC_2007-09.bz2", name = "2007-09"), + (file = 
"RC_2007-10.bz2", name = "2007-10"), + (file = "RC_2007-11.bz2", name = "2007-11"), + (file = "RC_2007-12.bz2", name = "2007-12"), + (file = "RC_2008-01.bz2", name = "2008-01"), + (file = "RC_2008-02.bz2", name = "2008-02"), + (file = "RC_2008-03.bz2", name = "2008-03"), + (file = "RC_2008-04.bz2", name = "2008-04"), + (file = "RC_2008-05.bz2", name = "2008-05"), + (file = "RC_2008-06.bz2", name = "2008-06"), + (file = "RC_2008-07.bz2", name = "2008-07"), + (file = "RC_2008-08.bz2", name = "2008-08"), + (file = "RC_2008-09.bz2", name = "2008-09"), + (file = "RC_2008-10.bz2", name = "2008-10"), + (file = "RC_2008-11.bz2", name = "2008-11"), + (file = "RC_2008-12.bz2", name = "2008-12"), + (file = "RC_2009-01.bz2", name = "2009-01"), + (file = "RC_2009-02.bz2", name = "2009-02"), + (file = "RC_2009-03.bz2", name = "2009-03"), + (file = "RC_2009-04.bz2", name = "2009-04"), + (file = "RC_2009-05.bz2", name = "2009-05"), + (file = "RC_2009-06.bz2", name = "2009-06"), + (file = "RC_2009-07.bz2", name = "2009-07"), + (file = "RC_2009-08.bz2", name = "2009-08"), + (file = "RC_2009-09.bz2", name = "2009-09"), + (file = "RC_2009-10.bz2", name = "2009-10"), + (file = "RC_2009-11.bz2", name = "2009-11"), + (file = "RC_2009-12.bz2", name = "2009-12"), + (file = "RC_2010-01.bz2", name = "2010-01"), + (file = "RC_2010-02.bz2", name = "2010-02"), + (file = "RC_2010-03.bz2", name = "2010-03"), + (file = "RC_2010-04.bz2", name = "2010-04"), + (file = "RC_2010-05.bz2", name = "2010-05"), + (file = "RC_2010-06.bz2", name = "2010-06"), + (file = "RC_2010-07.bz2", name = "2010-07"), + (file = "RC_2010-08.bz2", name = "2010-08"), + (file = "RC_2010-09.bz2", name = "2010-09"), + (file = "RC_2010-10.bz2", name = "2010-10"), + (file = "RC_2010-11.bz2", name = "2010-11"), + (file = "RC_2010-12.bz2", name = "2010-12"), + (file = "RC_2011-01.bz2", name = "2011-01"), + (file = "RC_2011-02.bz2", name = "2011-02"), + (file = "RC_2011-03.bz2", name = "2011-03"), + (file = "RC_2011-04.bz2", name = 
"2011-04"), + (file = "RC_2011-05.bz2", name = "2011-05"), + (file = "RC_2011-06.bz2", name = "2011-06"), + (file = "RC_2011-07.bz2", name = "2011-07"), + (file = "RC_2011-08.bz2", name = "2011-08"), + (file = "RC_2011-09.bz2", name = "2011-09"), + (file = "RC_2011-10.bz2", name = "2011-10"), + (file = "RC_2011-11.bz2", name = "2011-11"), + (file = "RC_2011-12.bz2", name = "2011-12"), + (file = "RC_2012-01.bz2", name = "2012-01"), + (file = "RC_2012-02.bz2", name = "2012-02"), + (file = "RC_2012-03.bz2", name = "2012-03"), + (file = "RC_2012-04.bz2", name = "2012-04"), + (file = "RC_2012-05.bz2", name = "2012-05"), + (file = "RC_2012-06.bz2", name = "2012-06"), + (file = "RC_2012-07.bz2", name = "2012-07"), + (file = "RC_2012-08.bz2", name = "2012-08"), + (file = "RC_2012-09.bz2", name = "2012-09"), + (file = "RC_2012-10.bz2", name = "2012-10"), + (file = "RC_2012-11.bz2", name = "2012-11"), + (file = "RC_2012-12.bz2", name = "2012-12"), + (file = "RC_2013-01.bz2", name = "2013-01"), + (file = "RC_2013-02.bz2", name = "2013-02"), + (file = "RC_2013-03.bz2", name = "2013-03"), + (file = "RC_2013-04.bz2", name = "2013-04"), + (file = "RC_2013-05.bz2", name = "2013-05"), + (file = "RC_2013-06.bz2", name = "2013-06"), + (file = "RC_2013-07.bz2", name = "2013-07"), + (file = "RC_2013-08.bz2", name = "2013-08"), + (file = "RC_2013-09.bz2", name = "2013-09"), + (file = "RC_2013-10.bz2", name = "2013-10"), + (file = "RC_2013-11.bz2", name = "2013-11"), + (file = "RC_2013-12.bz2", name = "2013-12"), + (file = "RC_2014-01.bz2", name = "2014-01"), + (file = "RC_2014-02.bz2", name = "2014-02"), + (file = "RC_2014-03.bz2", name = "2014-03"), + (file = "RC_2014-04.bz2", name = "2014-04"), + (file = "RC_2014-05.bz2", name = "2014-05"), + (file = "RC_2014-06.bz2", name = "2014-06"), + (file = "RC_2014-07.bz2", name = "2014-07"), + (file = "RC_2014-08.bz2", name = "2014-08"), + (file = "RC_2014-09.bz2", name = "2014-09"), + (file = "RC_2014-10.bz2", name = "2014-10"), + (file = 
"RC_2014-11.bz2", name = "2014-11"), + (file = "RC_2014-12.bz2", name = "2014-12"), + (file = "RC_2015-01.bz2", name = "2015-01"), + (file = "RC_2015-02.bz2", name = "2015-02"), + (file = "RC_2015-03.bz2", name = "2015-03"), + (file = "RC_2015-04.bz2", name = "2015-04"), + (file = "RC_2015-05.bz2", name = "2015-05"), + (file = "RC_2015-06.bz2", name = "2015-06"), + (file = "RC_2015-07.bz2", name = "2015-07"), + (file = "RC_2015-08.bz2", name = "2015-08"), + (file = "RC_2015-09.bz2", name = "2015-09"), + (file = "RC_2015-10.bz2", name = "2015-10"), + (file = "RC_2015-11.bz2", name = "2015-11"), + (file = "RC_2015-12.bz2", name = "2015-12"), + (file = "RC_2016-01.bz2", name = "2016-01"), + (file = "RC_2016-02.bz2", name = "2016-02"), + (file = "RC_2016-03.bz2", name = "2016-03"), + (file = "RC_2016-04.bz2", name = "2016-04"), + (file = "RC_2016-05.bz2", name = "2016-05"), + (file = "RC_2016-06.bz2", name = "2016-06"), + (file = "RC_2016-07.bz2", name = "2016-07"), + (file = "RC_2016-08.bz2", name = "2016-08"), + (file = "RC_2016-09.bz2", name = "2016-09"), + (file = "RC_2016-10.bz2", name = "2016-10"), + (file = "RC_2016-11.bz2", name = "2016-11"), + (file = "RC_2016-12.bz2", name = "2016-12"), + (file = "RC_2017-01.bz2", name = "2017-01"), + (file = "RC_2017-02.bz2", name = "2017-02"), + (file = "RC_2017-03.bz2", name = "2017-03"), + (file = "RC_2017-04.bz2", name = "2017-04"), + (file = "RC_2017-05.bz2", name = "2017-05"), + (file = "RC_2017-06.bz2", name = "2017-06"), + (file = "RC_2017-07.bz2", name = "2017-07"), + (file = "RC_2017-08.bz2", name = "2017-08"), + (file = "RC_2017-09.bz2", name = "2017-09"), + (file = "RC_2017-10.bz2", name = "2017-10"), + (file = "RC_2017-11.bz2", name = "2017-11"), + (file = "RC_2017-12.xz", name = "2017-12"), + (file = "RC_2018-01.xz", name = "2018-01"), + (file = "RC_2018-02.xz", name = "2018-02"), + (file = "RC_2018-03.xz", name = "2018-03"), + (file = "RC_2018-04.xz", name = "2018-04"), + (file = "RC_2018-05.xz", name = 
"2018-05"), + (file = "RC_2018-06.xz", name = "2018-06"), + (file = "RC_2018-07.xz", name = "2018-07"), + (file = "RC_2018-08.xz", name = "2018-08"), + (file = "RC_2018-09.xz", name = "2018-09"), + (file = "RC_2018-10.zst", name = "2018-10"), + (file = "RC_2018-11.zst", name = "2018-11"), + (file = "RC_2018-12.zst", name = "2018-12"), + (file = "RC_2019-01.zst", name = "2019-01"), + (file = "RC_2019-02.zst", name = "2019-02"), + (file = "RC_2019-03.zst", name = "2019-03"), + (file = "RC_2019-04.zst", name = "2019-04"), + (file = "RC_2019-05.zst", name = "2019-05"), + (file = "RC_2019-06.zst", name = "2019-06"), + (file = "RC_2019-07.zst", name = "2019-07"), + (file = "RC_2019-08.zst", name = "2019-08"), + (file = "RC_2019-09.zst", name = "2019-09"), + (file = "RC_2019-10.zst", name = "2019-10"), + (file = "RC_2019-11.zst", name = "2019-11"), + (file = "RC_2019-12.zst", name = "2019-12"), +] + +submissions_metadata = [ + (file = "RS_v2_2005-06.xz", name = "2005-06"), + (file = "RS_v2_2005-07.xz", name = "2005-07"), + (file = "RS_v2_2005-08.xz", name = "2005-08"), + (file = "RS_v2_2005-09.xz", name = "2005-09"), + (file = "RS_v2_2005-10.xz", name = "2005-10"), + (file = "RS_v2_2005-11.xz", name = "2005-11"), + (file = "RS_v2_2005-12.xz", name = "2005-12"), + (file = "RS_v2_2006-01.xz", name = "2006-01"), + (file = "RS_v2_2006-02.xz", name = "2006-02"), + (file = "RS_v2_2006-03.xz", name = "2006-03"), + (file = "RS_v2_2006-04.xz", name = "2006-04"), + (file = "RS_v2_2006-05.xz", name = "2006-05"), + (file = "RS_v2_2006-06.xz", name = "2006-06"), + (file = "RS_v2_2006-07.xz", name = "2006-07"), + (file = "RS_v2_2006-08.xz", name = "2006-08"), + (file = "RS_v2_2006-09.xz", name = "2006-09"), + (file = "RS_v2_2006-10.xz", name = "2006-10"), + (file = "RS_v2_2006-11.xz", name = "2006-11"), + (file = "RS_v2_2006-12.xz", name = "2006-12"), + (file = "RS_v2_2007-01.xz", name = "2007-01"), + (file = "RS_v2_2007-02.xz", name = "2007-02"), + (file = "RS_v2_2007-03.xz", name = 
"2007-03"), + (file = "RS_v2_2007-04.xz", name = "2007-04"), + (file = "RS_v2_2007-05.xz", name = "2007-05"), + (file = "RS_v2_2007-06.xz", name = "2007-06"), + (file = "RS_v2_2007-07.xz", name = "2007-07"), + (file = "RS_v2_2007-08.xz", name = "2007-08"), + (file = "RS_v2_2007-09.xz", name = "2007-09"), + (file = "RS_v2_2007-10.xz", name = "2007-10"), + (file = "RS_v2_2007-11.xz", name = "2007-11"), + (file = "RS_v2_2007-12.xz", name = "2007-12"), + (file = "RS_v2_2008-02.xz", name = "2008-02"), + (file = "RS_v2_2008-03.xz", name = "2008-03"), + (file = "RS_v2_2008-04.xz", name = "2008-04"), + (file = "RS_v2_2008-05.xz", name = "2008-05"), + (file = "RS_v2_2008-07.xz", name = "2008-07"), + (file = "RS_v2_2008-08.xz", name = "2008-08"), + (file = "RS_v2_2008-09.xz", name = "2008-09"), + (file = "RS_v2_2008-10.xz", name = "2008-10"), + (file = "RS_v2_2008-11.xz", name = "2008-11"), + (file = "RS_v2_2008-12.xz", name = "2008-12"), + (file = "RS_v2_2009-01.xz", name = "2009-01"), + (file = "RS_v2_2009-02.xz", name = "2009-02"), + (file = "RS_v2_2009-03.xz", name = "2009-03"), + (file = "RS_v2_2009-04.xz", name = "2009-04"), + (file = "RS_v2_2009-05.xz", name = "2009-05"), + (file = "RS_v2_2009-06.xz", name = "2009-06"), + (file = "RS_v2_2009-07.xz", name = "2009-07"), + (file = "RS_v2_2009-08.xz", name = "2009-08"), + (file = "RS_v2_2009-09.xz", name = "2009-09"), + (file = "RS_v2_2009-10.xz", name = "2009-10"), + (file = "RS_v2_2009-11.xz", name = "2009-11"), + (file = "RS_v2_2009-12.xz", name = "2009-12"), + (file = "RS_v2_2010-01.xz", name = "2010-01"), + (file = "RS_v2_2010-02.xz", name = "2010-02"), + (file = "RS_v2_2010-03.xz", name = "2010-03"), + (file = "RS_v2_2010-04.xz", name = "2010-04"), + (file = "RS_v2_2010-05.xz", name = "2010-05"), + (file = "RS_v2_2010-06.xz", name = "2010-06"), + (file = "RS_v2_2010-07.xz", name = "2010-07"), + (file = "RS_v2_2010-08.xz", name = "2010-08"), + (file = "RS_v2_2010-09.xz", name = "2010-09"), + (file = 
"RS_v2_2010-10.xz", name = "2010-10"), + (file = "RS_v2_2010-11.xz", name = "2010-11"), + (file = "RS_v2_2010-12.xz", name = "2010-12"), + (file = "RS_2011-01.bz2", name = "2011-01"), + (file = "RS_2011-02.bz2", name = "2011-02"), + (file = "RS_2011-03.bz2", name = "2011-03"), + (file = "RS_2011-04.bz2", name = "2011-04"), + (file = "RS_2011-05.bz2", name = "2011-05"), + (file = "RS_2011-06.bz2", name = "2011-06"), + (file = "RS_2011-07.bz2", name = "2011-07"), + (file = "RS_2011-08.bz2", name = "2011-08"), + (file = "RS_2011-09.bz2", name = "2011-09"), + (file = "RS_2011-10.bz2", name = "2011-10"), + (file = "RS_2011-11.bz2", name = "2011-11"), + (file = "RS_2011-12.bz2", name = "2011-12"), + (file = "RS_2012-01.bz2", name = "2012-01"), + (file = "RS_2012-02.bz2", name = "2012-02"), + (file = "RS_2012-03.bz2", name = "2012-03"), + (file = "RS_2012-04.bz2", name = "2012-04"), + (file = "RS_2012-05.bz2", name = "2012-05"), + (file = "RS_2012-06.bz2", name = "2012-06"), + (file = "RS_2012-07.bz2", name = "2012-07"), + (file = "RS_2012-08.bz2", name = "2012-08"), + (file = "RS_2012-09.bz2", name = "2012-09"), + (file = "RS_2012-10.bz2", name = "2012-10"), + (file = "RS_2012-11.bz2", name = "2012-11"), + (file = "RS_2012-12.bz2", name = "2012-12"), + (file = "RS_2013-01.bz2", name = "2013-01"), + (file = "RS_2013-02.bz2", name = "2013-02"), + (file = "RS_2013-03.bz2", name = "2013-03"), + (file = "RS_2013-04.bz2", name = "2013-04"), + (file = "RS_2013-05.bz2", name = "2013-05"), + (file = "RS_2013-06.bz2", name = "2013-06"), + (file = "RS_2013-07.bz2", name = "2013-07"), + (file = "RS_2013-08.bz2", name = "2013-08"), + (file = "RS_2013-09.bz2", name = "2013-09"), + (file = "RS_2013-10.bz2", name = "2013-10"), + (file = "RS_2013-11.bz2", name = "2013-11"), + (file = "RS_2013-12.bz2", name = "2013-12"), + (file = "RS_2014-01.bz2", name = "2014-01"), + (file = "RS_2014-02.bz2", name = "2014-02"), + (file = "RS_2014-03.bz2", name = "2014-03"), + (file = "RS_2014-04.bz2", 
name = "2014-04"), + (file = "RS_2014-05.bz2", name = "2014-05"), + (file = "RS_2014-06.bz2", name = "2014-06"), + (file = "RS_2014-07.bz2", name = "2014-07"), + (file = "RS_2014-08.bz2", name = "2014-08"), + (file = "RS_2014-09.bz2", name = "2014-09"), + (file = "RS_2014-10.bz2", name = "2014-10"), + (file = "RS_2014-11.bz2", name = "2014-11"), + (file = "RS_2014-12.bz2", name = "2014-12"), + (file = "RS_2015-01.zst", name = "2015-01"), + (file = "RS_2015-02.zst", name = "2015-02"), + (file = "RS_2015-03.zst", name = "2015-03"), + (file = "RS_2015-04.zst", name = "2015-04"), + (file = "RS_2015-05.zst", name = "2015-05"), + (file = "RS_2015-06.zst", name = "2015-06"), + (file = "RS_2015-07.zst", name = "2015-07"), + (file = "RS_2015-08.zst", name = "2015-08"), + (file = "RS_2015-09.zst", name = "2015-09"), + (file = "RS_2015-10.zst", name = "2015-10"), + (file = "RS_2015-11.zst", name = "2015-11"), + (file = "RS_2015-12.zst", name = "2015-12"), + (file = "RS_2016-01.zst", name = "2016-01"), + (file = "RS_2016-02.zst", name = "2016-02"), + (file = "RS_2016-03.zst", name = "2016-03"), + (file = "RS_2016-04.zst", name = "2016-04"), + (file = "RS_2016-05.zst", name = "2016-05"), + (file = "RS_2016-06.zst", name = "2016-06"), + (file = "RS_2016-07.zst", name = "2016-07"), + (file = "RS_2016-08.zst", name = "2016-08"), + (file = "RS_2016-09.zst", name = "2016-09"), + (file = "RS_2016-10.zst", name = "2016-10"), + (file = "RS_2016-11.zst", name = "2016-11"), + (file = "RS_2016-12.zst", name = "2016-12"), + (file = "RS_2017-01.bz2", name = "2017-01"), + (file = "RS_2017-02.bz2", name = "2017-02"), + (file = "RS_2017-03.bz2", name = "2017-03"), + (file = "RS_2017-04.bz2", name = "2017-04"), + (file = "RS_2017-05.bz2", name = "2017-05"), + (file = "RS_2017-06.bz2", name = "2017-06"), + (file = "RS_2017-07.bz2", name = "2017-07"), + (file = "RS_2017-08.bz2", name = "2017-08"), + (file = "RS_2017-09.bz2", name = "2017-09"), + (file = "RS_2017-10.bz2", name = "2017-10"), + 
(file = "RS_2017-11.xz", name = "2017-11"), + (file = "RS_2017-12.xz", name = "2017-12"), + (file = "RS_2018-01.xz", name = "2018-01"), + (file = "RS_2018-02.xz", name = "2018-02"), + (file = "RS_2018-03.xz", name = "2018-03"), + (file = "RS_2018-04.xz", name = "2018-04"), + (file = "RS_2018-05.xz", name = "2018-05"), + (file = "RS_2018-06.xz", name = "2018-06"), + (file = "RS_2018-07.xz", name = "2018-07"), + (file = "RS_2018-08.xz", name = "2018-08"), + (file = "RS_2018-09.xz", name = "2018-09"), + (file = "RS_2018-10.xz", name = "2018-10"), + (file = "RS_2018-11.zst", name = "2018-11"), + (file = "RS_2018-12.zst", name = "2018-12"), + (file = "RS_2019-01.zst", name = "2019-01"), + (file = "RS_2019-02.zst", name = "2019-02"), + (file = "RS_2019-03.zst", name = "2019-03"), + (file = "RS_2019-04.zst", name = "2019-04"), + (file = "RS_2019-05.zst", name = "2019-05"), + (file = "RS_2019-06.zst", name = "2019-06"), + (file = "RS_2019-07.zst", name = "2019-07"), + (file = "RS_2019-08.zst", name = "2019-08"), + (file = "RS_2019-09.zst", name = "2019-09"), + (file = "RS_2019-10.zst", name = "2019-10"), + (file = "RS_2019-11.zst", name = "2019-11"), + (file = "RS_2019-12.zst", name = "2019-12"), + (file = "RS_2020-01.zst", name = "2020-01"), + (file = "RS_2020-02.zst", name = "2020-02"), + (file = "RS_2020-03.zst", name = "2020-03"), + (file = "RS_2020-04.zst", name = "2020-04"), +] + + +function init_deps() + for comment in comments_metadata + register(DataDep( + "reddit-comments-$(comment.name)", + """ + Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). + The pushshift reddit dataset. + In Proceedings of the International AAAI Conference on Web and Social Media + Vol. 14, pp. 830-839). 
+ """, + "https://files.pushshift.io/reddit/comments/$(comment.file)", + )) + end + for submission in submissions_metadata + register(DataDep( + "reddit-submissions-$(submission.name)", + """ + Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). + The pushshift reddit dataset. + In Proceedings of the International AAAI Conference on Web and Social Media + Vol. 14, pp. 830-839). + """, + "https://files.pushshift.io/reddit/submissions/$(submission.file)", + )) + end + + register(DataDep( + "reddit-comments-sample", + """ +Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). +The pushshift reddit dataset. +In Proceedings of the International AAAI Conference on Web and Social Media +Vol. 14, pp. 830-839). +""", + "https://files.pushshift.io/reddit/comments/sample_data.json", + )) + + register(DataDep( + "reddit-submissions-sample", + """ + Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., & Blackburn, J. (2020, May). + The pushshift reddit dataset. + In Proceedings of the International AAAI Conference on Web and Social Media + Vol. 14, pp. 830-839). + """, + "https://files.pushshift.io/reddit/submissions/sample.json", + )) +end diff --git a/src/distiller.jl b/src/distiller.jl new file mode 100644 index 0000000..6ff48ea --- /dev/null +++ b/src/distiller.jl @@ -0,0 +1,41 @@ +using JSON3 +using ProgressBars +# using DataFrames +# using CSV + +function distill(path::String, datafilter::RedditDataFilter) + file = getfile(path) + data_vector = Vector{Dict{Symbol,Any}} + multithreaded = Threads.nthreads() > 1 + values = multithreaded ? 
[data_vector() for t = 1:Threads.nthreads()] : data_vector() + decompressor_stream = get_decompressor_stream(file) + open(decompressor_stream, file) do stream + bar = ProgressBar(eachline(stream)) + bar.description = "File: $(basename(file)); Threads: $(Threads.nthreads())" + for line in bar + if multithreaded + Threads.@spawn begin # NOTE(review): these tasks are never awaited, so the final flatten can run before the last push! finishes + if contains(line, datafilter) + linedata = JSON3.read(line) + if contains(linedata, datafilter) + data = convert(Dict, linedata) + data = filter(datafilter, data) + fetched_data = fetch(data) + push!(values[Threads.threadid()], fetched_data) + end + end + end + else + if contains(line, datafilter) + linedata = JSON3.read(line) + if contains(linedata, datafilter) + data = convert(Dict, linedata) + data = filter(datafilter, data) + push!(values, data) + end + end + end + end + end + return multithreaded ? collect(Iterators.flatten(values)) : values +end diff --git a/src/filter.jl b/src/filter.jl new file mode 100644 index 0000000..879821e --- /dev/null +++ b/src/filter.jl @@ -0,0 +1,30 @@ +linevalue(key::String, value::String) = "\"$key\":\"$value\"" # raw "key":"value" fragment searched for in an unparsed line + +function Base.contains(line::String, datafilter::RedditDataFilter) + if isempty(datafilter.author) && isempty(datafilter.subreddit) # no author/subreddit criteria: every line matches + return true + else + authorcheck = any(contains.(line, linevalue.("author", datafilter.author))) + subredditcheck = any(contains.(line, linevalue.("subreddit", datafilter.subreddit))) + return any([authorcheck, subredditcheck]) + end +end + +function Base.contains(json::JSON3.Object, datafilter::RedditDataFilter) + if isempty(datafilter.author) && isempty(datafilter.subreddit) # no author/subreddit criteria: every record matches + return true + else + authorcheck = get(json, :author, nothing) ∈ datafilter.author + subredditcheck = get(json, :subreddit, nothing) ∈ datafilter.subreddit + return any([authorcheck, subredditcheck]) + end +end + +function Base.filter(datafilter::RedditDataFilter, data::Dict{Symbol,Any}) + if isempty(datafilter.fields) + return data + else + incols(x) = x ∈ Symbol.(datafilter.fields) + 
return filter(d -> incols(d.first), data) + end +end diff --git a/src/types.jl b/src/types.jl new file mode 100644 index 0000000..136f94c --- /dev/null +++ b/src/types.jl @@ -0,0 +1,25 @@ +using Parameters + +@with_kw struct RedditDataFilter + fields::Vector{String} = String[] + author::Vector{String} = String[] + subreddit::Vector{String} = String[] +end + + +RedditDataFilter(fields::String, author::Vector{String}, subreddit::Vector{String}) = + RedditDataFilter(fields = [fields], author = author, subreddit = subreddit) +RedditDataFilter(fields::Vector{String}, author::String, subreddit::Vector{String}) = + RedditDataFilter(fields = fields, author = [author], subreddit = subreddit) +RedditDataFilter(fields::Vector{String}, author::Vector{String}, subreddit::String) = + RedditDataFilter(fields = fields, author = author, subreddit = [subreddit]) + +RedditDataFilter(fields::String, author::String, subreddit::Vector{String}) = + RedditDataFilter(fields = [fields], author = [author], subreddit = subreddit) +RedditDataFilter(fields::String, author::Vector{String}, subreddit::String) = + RedditDataFilter(fields = [fields], author = author, subreddit = [subreddit]) +RedditDataFilter(fields::Vector{String}, author::String, subreddit::String) = + RedditDataFilter(fields = fields, author = [author], subreddit = [subreddit]) + +RedditDataFilter(fields::String, author::String, subreddit::String) = + RedditDataFilter(fields = [fields], author = [author], subreddit = [subreddit]) diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 0000000..764279c --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,36 @@ +using CodecBzip2 +using CodecZstd +using CodecXz +using CodecZlib +using TranscodingStreams + +decompression_codecs = Dict( + "bz2" => Bzip2DecompressorStream, + "zst" => ZstdDecompressorStream, + "xz" => XzDecompressorStream, + "gz" => GzipDecompressorStream, +) + +compression_codecs = Dict( + "bz2" => Bzip2CompressorStream, + "zst" => ZstdCompressorStream, + "xz" => 
XzCompressorStream, + "gz" => GzipCompressorStream, +) + +get_decompressor_stream(filename::String) = + get(decompression_codecs, split(filename, ".") |> last, NoopStream) +get_compressor_stream(filename::String) = + get(compression_codecs, split(filename, ".") |> last, NoopStream) + +function getfile(path::String) + if isdir(path) + files = readdir(path, join = true) + if length(files) > 1 + @warn "More than one file in directory ($path). Using first file." + end + return files |> first + else + return path + end +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..23a2a32 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,19 @@ +using PushshiftRedditDistiller +using Test +using DataDeps + +ENV["DATADEPS_ALWAYS_ACCEPT"] = true +ENV["CI"] = true + +@testset "PushshiftRedditDistiller.jl" begin + filter = RedditDataFilter() + @test isempty(filter.author) + @test isempty(filter.subreddit) + @test isempty(filter.fields) + + sample_submissions = distill(datadep"reddit-submissions-sample", filter) + @test length(sample_submissions) == 1000 + + sample_comments = distill(datadep"reddit-comments-sample", filter) + @test length(sample_comments) == 10000 +end \ No newline at end of file