diff --git a/madams56.ipynb b/madams56.ipynb new file mode 100644 index 0000000..7f9773a --- /dev/null +++ b/madams56.ipynb @@ -0,0 +1,750 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pprint\n", + "import re\n", + "import pymongo, json\n", + "\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "\n", + "client = pymongo.MongoClient(host=\"da1.eecs.utk.edu\")\n", + "db = client['fdac19mp2']\n", + "coll = db['madams56']\n", + "# One insert_one call per dataset record (keys: topic, title, license, description, urls); no _id is supplied, so re-running this cell inserts the same records again.\n", + "coll.insert_one(\n", + " {'topic':'publication',\n", + " 'title': 'arXiv publications dataset with simulated citation relationships',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/', \n", + " 'description': '''arXiv publications dataset with simulated citation relationships\n", + "https://github.com/jacekmiecznikowski/neo4index App evaluates scientific reasearch impact using author-level metrics (h-index and more)\n", + "This collection contains data aquired from arXiv.org via OAI2 protocol. arXiv does not provide citations metadata so this data was pseudo-randomly simulated. 
We evaluated scientific reasearch impact using six popular author-level metrics: * h-index, * m quotient, * e-index, * m-index, * r-index, * ar-index.\n", + "Source https://arxiv.org/help/bulk_data (downloaded: 2018-03-23; over 1.3 million publications)''',\n", + " 'urls': ['https://figshare.com/articles/arXiv_publications_dataset_with_simulated_citation_relationships/6449756/1',\n", + " 'https://doi.org/10.6084/m9.figshare.6449756.v1']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'publication',\n", + " 'title': 'PLOS ONE publication and citation data',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'description': 'Data description provided in the enclosed UC-DASH_DataDescription_Petersen.pdf',\n", + " 'urls': ['https://dash.ucmerced.edu/stash/dataset/doi:10.6071/M39W8V',\n", + " 'https://doi.org/10.6071/M39W8V']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Social Networks',\n", + " 'license': 'http://networkrepository.com/policy.php',\n", + " 'description': 'social network data, download social network data, social graph, complex networks, digg data, livejournal graph data, slashdot data, flickr social network, signed network data, epinions trust network data, google plus data, download myspace data, linkedin data, orkut social network, karate social network, twitter follower graph, twitter-higgs, download sinaweibo graph, download friendster social network, wiki election data, youtube data, lastfm data, douban social network',\n", + " 'urls': ['http://networkrepository.com/soc.php']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social network',\n", + " 'title': 'Disasters on social media',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'description': '''To spot disasters, many institutions rely on social media. 
Bystanders often post about what is happening making information on social media faster and more informative than news reports. That is, if you know which posts actually are about disasters and which posts are irrelevant.\n", + "This dataset stems from the figure-eight (formally known as Crowdflower) data for everyone website.''',\n", + " 'urls': ['https://www.kaggle.com/jannesklaas/disasters-on-social-media']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Twitter Social Graph 2009',\n", + " 'license': 'License not specified',\n", + " 'description': 'The list of projects on github with the largest number of starts',\n", + " 'urls': ['https://datahub.io/dataset/twitter-social-graph-www2010',\n", + " 'https://datahub.ckan.io/is/dataset/6529915a-97e9-4a4a-9ced-677408c90679',\n", + " 'https://data.wu.ac.at/odso/datahub_io/NjUyOTkxNWEtOTdlOS00YTRhLTljZWQtNjc3NDA4YzkwNjc5']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Graph of Flickr Photo-Sharing Social Network Crawled in May 2006',\n", + " 'license': 'CC0 1.0 Universal',\n", + " 'description': 'Crawl of the Flickr photo-sharing social network from May 2006 returning a graph with 820,878 nodes and 9,837,214 edges. Dataset is distributed as a SMAT file with README file with code to read file in Python and MATLAB.',\n", + " 'urls': ['https://purr.purdue.edu/publications/1002',\n", + " 'https://search.datacite.org/works/10.4231/d39p2w550',\n", + " 'https://doi.org/10.4231/D39P2W550']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'The Marvel Universe Social Network',\n", + " 'license': 'https://creativecommons.org/licenses/by-nc-sa/4.0/',\n", + " 'description': '''The Marvel Universe\n", + "Marvel Comics, originally called Timely Comics Inc., has been publishing comic books for several decades. 
\"The Golden Age of Comics\" name that was given due to the popularity of the books during the first years, was later followed by a period of decline of interest in superhero stories due to World War ref. In 1961, Marvel relaunched its superhero comic books publishing line. This new era started what has been known as the Marvel Age of Comics. Characters created during this period such as Spider-Man, the Hulk, the Fantastic Four, and the X-Men, together with those created during the Golden Age such as Captain America, are known worldwide and have become cultural icons during the last decades. Later, Marvel's characters popularity has been revitalized even more due to the release of several recent movies which recreate the comic books using spectacular modern special effects. Nowadays, it is possible to access the content of the comic books via a digital platform created by Marvel, where it is possible to subscribe monthly or yearly to get access to the comics. More information about the Marvel Universe can be found here.\n", + "Content\n", + "The dataset contains heroes and comics, and the relationship between them. The dataset is divided into three files:\n", + " nodes.csv: Contains two columns (node, type), indicating the name and the type (comic, hero) of the nodes.\n", + " edges.csv: Contains two columns (hero, comic), indicating in which comics the heroes appear.\n", + " hero-edge.csv: Contains the network of heroes which appear together in the comics. 
This file was originally taken from http://syntagmatic.github.io/exposedata/marvel/\n", + "''',\n", + " 'urls': ['https://www.kaggle.com/csanhueza/the-marvel-universe-social-network']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Dynamic Networks',\n", + " 'license': 'http://networkrepository.com/policy.php',\n", + " 'description': 'dynamic networks, temporal graphs, download dynamic networks, download temporal graph data, graph streams, edge stream data, time-evolving graph data, proximity networks, facebook message data, facebook forum data, digg, hospital, primary school proximity network, sensor network data, enron data, escort network, bitcoin graph data, wikipedia election graph, workplace contacts, radoslaw email network, youtube-growth',\n", + " 'urls': ['http://networkrepository.com/dynamic.php']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Github interaction graphs for rich-club behavior analysis',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'description': 'Dataset containing the graphs generated using pull-requests, issues and commits of the top100 starred projects of Github in 2016. In each graph, a pair of nodes (developers/users) are connected if they interacted on the same Github project element, e.g.: in the pull-request graph, they are connected if they have handled the same PR (similar with issues and committed files for the other two types of graph). A derived graph called supergraph is loaded in the dataset: it is calculated as the union of previous 3 graph types, e.g: two users are connected in this graph if they have interacted on the same issues, or on the same pull request or if they committed to the same file. 
A list of the projects is available, containing both Github IDs and complete name of the repository.',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AA4IIS',\n", + " 'https://doi.org/10.7910/DVN/AA4IIS']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'publication',\n", + " 'title': 'Metadata, Title Pages, and Network Graph of the Digitized Content of the Berlin State Library (146,000 items)',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'description': '''The data set has been downloaded via the OAI-PMH endpoint of the Berlin State Library/Staatsbibliothek zu Berlin’s Digitized Collections (https://digital.staatsbibliothek-berlin.de/oai) on March 1st 2019 and converted into common tabular formats on the basis of the provided Dublin Core metadata. It contains 146,000 records. In addition to the bibliographic metadata, representative images of the works have been downloaded, resized to a 512 pixel maximum thumbnail image and saved in JPEG format. The image data is split into title pages and first pages. Title pages have been derived from structural metadata created by scan operators and librarians. If this information was not available, first pages of the media have been downloaded. In case of multi-volume media, title pages are not available. In total, 141,206 images title/first pages are available.\n", + "Furthermore, the tabular data has been cleaned and extended with geo-spatial coordinates provided by the OpenStreetMap project (https://www.openstreetmap.org). The actual data processing steps are summarized in the next section. For the sake of transparency and reproducibility, the original data taken from the OAI-PMH endpoint is still present in the table.\n", + "To conclude with, various graphs in GML file format are available that can be loaded directly into graph analysis tools such as Gephi (https://gephi.org/).\n", + "The implementation of the data processing steps (incl. 
graph creation) are available as a Jupyter notebook provided at https://github.com/elektrobohemian/SBBrowse2018/blob/master/DataProcessing.ipynb.''',\n", + " 'urls': ['https://figshare.com/articles/Metadata_Title_Pages_and_Network_Graph_of_the_Digitized_Content_of_the_Berlin_State_Library_146_000_items_/7797170',\n", + " 'https://zenodo.org/record/2582482',\n", + " 'https://doi.org/10.5281/zenodo.2582482']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Relato Business Graph Database',\n", + " 'license': 'CC-0',\n", + " 'description': 'This is the graph database from the startup Relato (2015-2016). It contains links between businesses pulled from the web. It contains 373,663 links between companies, of the types \"partnership\" (one company listed on another company\\'s partnership page), \"customer\" (one company listed on another company\\'s example customer page), \"competitor\" (co-bidders on AdWords above some limit), \"investment\" (a company listed on a VC\\'s website), \"supplier\" (the inverse of the \"customer\" type. This dataset was used to drive both a lead generation system where metrics on the graph fed into a classification for leads (lead/no lead) and a market visualization system (a force directed layout of markets and their segments).',\n", + " 'urls': ['https://data.world/datasyndrome/relato-business-graph-database']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Multilabel user classification using the community structure of online networks',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'description': 'We study the problem of semi-supervised, multi-label user classification of networked data in the online social platform setting. We propose a framework that combines unsupervised community extraction and supervised, community-based feature weighting before training a classifier. 
We introduce Approximate Regularized Commute-Time Embedding (ARCTE), an algorithm that projects the users of a social graph onto a latent space, but instead of packing the global structure into a matrix of predefined rank, as many spectral and neural representation learning methods do, it extracts local communities for all users in the graph in order to learn a sparse embedding. To this end, we employ an improvement of personalized PageRank algorithms for searching locally in each user’s graph structure. Then, we perform supervised community feature weighting in order to boost the importance of highly predictive communities. We assess our method performance on the problem of user classification by performing an extensive comparative study among various recent methods based on graph embeddings. The comparison shows that ARCTE significantly outperforms the competition in almost all cases, achieving up to 35% relative improvement compared to the second best competing method in terms of F1-score.',\n", + " 'urls': ['https://figshare.com/articles/Multilabel_user_classification_using_the_community_structure_of_online_networks/4740307/1',\n", + " 'https://doi.org/10.1371/journal.pone.0173347']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Nashville Meetup Network',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'description': '''meetup.com is a website for people organizing and attending regular or semi-regular events (\"meet-ups\"). The relationships amongst users—who goes to what meetups—are a social network, ideal for graph-based analysis.\n", + "This dataset was generated for a talk titled Principles of Network Analysis with NetworkX, embedded online here (or with notebooks, etc. on Github). It forms the basis for a series of tutorials I presented on at PyNash and PyTennessee. 
In them, we work through the basics of graph theory and how to use NetworkX, a popular open-source Python package. We then apply this knowledge to extract insights about the social fabric of Tennessee MeetUp groups. ''',\n", + " 'urls': ['https://www.kaggle.com/stkbailey/nashville-meetup']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Data from: Real-Time Community Detection in Full Social Networks on a Laptop',\n", + " 'license': 'NA',\n", + " 'description': 'The project analysed the performance of community detection algorithms on the Twitter social network operating on a graph compressed using minhash signatures. The data supplied gives minhash signatures of roughly 16,000 Twitter users who have been classified into 16 categories. It is described in https://arxiv.org/abs/1601.03958 and together with code at https://github.com/melifluos/LSH-community-detection allows the results within to be replicated.',\n", + " 'urls': ['https://www.narcis.nl/dataset/RecordID/oai%3Aeasy.dans.knaw.nl%3Aeasy-dataset%3A76304',\n", + " 'https://doi.org/10.17026/dans-2bc-4qgc']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'NILM',\n", + " 'title': 'Data from: NILMPEds: A Performance Evaluation Dataset for Event Detection Algorithms in Non-Intrusive Load Monitoring',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'description': 'NILMPEds (NILM Performance Evaluation dataset), is a different type of NILM dataset, in a sense that it is aimed primarily at research reproducibility with respect to the development and performance evaluation of event detection algorithms. NILMPEds contains the results of 47950 event detection models when applied to four public event detection datasets. The different parameter configuration of each model and the ground-truth data are also available. 
Finally, this dataset also contains the performance evaluation of each model according to 31 performance metrics.',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/BUPJNR',\n", + " 'https://doi.org/10.7910/DVN/BUPJNR']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'NILM',\n", + " 'title': 'Annotated load anomalies from the REFIT Dataset',\n", + " 'license': 'http://creativecommons.org/licenses/by/4.0',\n", + " 'description': 'This dataset was created by sifting through the REFIT dataset to detect load anomalies; the rules for labelling anomalies are described in the accompanying ICASSP\\'19 paper, which should be referenced if the dataset is used. Five of the 20 houses of the REFIT dataset were included in this dataset, as they contained the largest number of detected anomalies. These are Houses 1, 10, 16, 18 and 20. At the time of release, this is the first detailed annotated dataset of anomalies within publicly available electrical load measurements. These are real anomalies, not simulated ones and are extremely useful in understanding anomalous behaviour of electrical appliances, as measured by smart meters. Anomalous behaviour of the following appliances is included in this dataset: Refrigerator, freezer, fridge-freezer, dishwasher, washing machine, tumble dryer, electrical heater and microwave. When using this dataset, please cite the following paper: H. Rashid, V. Stankovic, L. Stankovic and P. Singh, \"Evaluation of Non-Intrusive Load Monitoring Algorithms for Appliance-level Anomaly Detection,\" Proc. IEEE 44th Int. Conf. 
on Acoustics, Speech and Signal Processing (ICASSP), Brighton, UK, May 2019.',\n", + " 'urls': ['https://search.datacite.org/works/10.15129/9729a2a0-11ce-4cce-b0d0-144c483fcb33',\n", + " 'https://doi.org/10.15129/9729a2a0-11ce-4cce-b0d0-144c483fcb33']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'anomaly detection',\n", + " 'title': 'Spacecraft Power Monitor, Phase I',\n", + " 'license': 'NA',\n", + " 'description': 'This SBIR Phase I project will develop the Spacecraft Power Monitor (SPM) which will use non-intrusive electrical monitoring (NEMO). NEMO transforms the power distribution network in an spacecraft into a multiple-use service, providing not only power distribution but also a diagnostic monitoring capability based on careful measurement and analysis of power usage and start up and shut down transients. In depth analysis of this data enables real time assessment of system and component functioning and identifies potential system and component faults and failutes. We will use NEMO\\'s ability to track load operation to verify that the systems and components of a spacecraft are operating properly This \"spacecraft power monitor\" or SPM, based on NEMO, will notify astronauts or ground support personnel when unexpected sequences occur. It can also generally track the health and diagnostic condition of key loads on the system. The system is light weight, small and inexpensive because the system requires only a sensor at the mains power input and uses existing power wiring to carry data. Phase I will involve ground measurements of spacecraft components. 
Phase II will involve measurements and analysis of an integrated system.',\n", + " 'urls': ['https://data.nasa.gov/dataset/Spacecraft-Power-Monitor-Phase-I/hw4h-ctgu',\n", + " 'https://catalog.data.gov/dataset/spacecraft-power-monitor-phase-i']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social graph',\n", + " 'title': 'Replication data for: Complying by Denying: Explaining Why States Develop Nonproliferation Export Controls',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'description': 'This archive contains replication data and the supplemental appendix for the article: Stinnett, Douglas, Bryan Early, Cale Horne, and Johannes Karreth. 2011. \"Complying by Denying: Explaining Why States Develop Nonproliferation Export Controls.\" International Studies Perspectives 12 (3), 308-326. We provide original data for the nonproliferation export control systems of 30 randomly selected countries, measured in multiple dimensions.',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/W3B61P',\n", + " 'https://doi.org/10.7910/DVN/W3B61P']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'publication',\n", + " 'title': 'U.S. Department of Energy Office of Scientific and Technical Information',\n", + " 'license': 'Public Domain',\n", + " 'description': 'The Department of Energy (DOE) Office of Scientific and Technical Information (OSTI), a unit of the Office of Science, fulfills agency-wide responsibilities to collect, preserve, and disseminate both unclassified and classified scientific and technical information (STI) emanating from DOE-funded research and development (R&D) activities at DOE national laboratories and facilities and at universities and other institutions nationwide. 
OSTI provides access to DOE STI through a suite of web-based, searchable discovery tools and through other commonly used search engines, offering ever-expanding sources of R&D information to DOE, the research community, and the science-attentive public.',\n", + " 'urls': ['https://www.osti.gov/', 'https://www.osti.gov/MARC/']\n", + " }\n", + ")\n", + "coll.insert_one(\n", + " {'topic':'social network',\n", + " 'title': 'Sentiment140 dataset with 1.6 million tweets',\n", + " 'license': 'NA',\n", + " 'description': 'This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment .',\n", + " 'urls': ['https://www.kaggle.com/kazanova/sentiment140']\n", + " }\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "is_executing": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': ObjectId('5d93f62b9f7010afdece63de'),\n", + " 'description': 'arXiv publications dataset with simulated '\n", + " 'citation relationships\\n'\n", + " 'https://github.com/jacekmiecznikowski/neo4index '\n", + " 'App evaluates scientific reasearch impact '\n", + " 'using author-level metrics (h-index and more)\\n'\n", + " 'This collection contains data aquired from '\n", + " 'arXiv.org via OAI2 protocol. arXiv does not '\n", + " 'provide citations metadata so this data was '\n", + " 'pseudo-randomly simulated. 
We evaluated '\n", + " 'scientific reasearch impact using six popular '\n", + " 'author-level metrics: * h-index, * m quotient, '\n", + " '* e-index, * m-index, * r-index, * ar-index.\\n'\n", + " 'Source https://arxiv.org/help/bulk_data '\n", + " '(downloaded: 2018-03-23; over 1.3 million '\n", + " 'publications)',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'title': 'arXiv publications dataset with simulated citation '\n", + " 'relationships',\n", + " 'topic': 'publication',\n", + " 'urls': ['https://figshare.com/articles/arXiv_publications_dataset_with_simulated_citation_relationships/6449756/1',\n", + " 'https://doi.org/10.6084/m9.figshare.6449756.v1']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63df'),\n", + " 'description': 'Data description provided in the enclosed '\n", + " 'UC-DASH_DataDescription_Petersen.pdf',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'title': 'PLOS ONE publication and citation data',\n", + " 'topic': 'publication',\n", + " 'urls': ['https://dash.ucmerced.edu/stash/dataset/doi:10.6071/M39W8V',\n", + " 'https://doi.org/10.6071/M39W8V']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e0'),\n", + " 'description': 'social network data, download social network '\n", + " 'data, social graph, complex networks, digg '\n", + " 'data, livejournal graph data, slashdot data, '\n", + " 'flickr social network, signed network data, '\n", + " 'epinions trust network data, google plus data, '\n", + " 'download myspace data, linkedin data, orkut '\n", + " 'social network, karate social network, twitter '\n", + " 'follower graph, twitter-higgs, download '\n", + " 'sinaweibo graph, download friendster social '\n", + " 'network, wiki election data, youtube data, '\n", + " 'lastfm data, douban social network',\n", + " 'license': 'http://networkrepository.com/policy.php',\n", + " 'title': 'Social Networks',\n", + " 'topic': 'social graph',\n", + " 'urls': 
['http://networkrepository.com/soc.php']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e1'),\n", + " 'description': 'To spot disasters, many institutions rely on '\n", + " 'social media. Bystanders often post about what '\n", + " 'is happening making information on social '\n", + " 'media faster and more informative than news '\n", + " 'reports. That is, if you know which posts '\n", + " 'actually are about disasters and which posts '\n", + " 'are irrelevant.\\n'\n", + " 'This dataset stems from the figure-eight '\n", + " '(formally known as Crowdflower) data for '\n", + " 'everyone website.',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'title': 'Disasters on social media',\n", + " 'topic': 'social network',\n", + " 'urls': ['https://www.kaggle.com/jannesklaas/disasters-on-social-media']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e2'),\n", + " 'description': 'The list of projects on github with the '\n", + " 'largest number of starts',\n", + " 'license': 'License not specified',\n", + " 'title': 'Twitter Social Graph 2009',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://datahub.io/dataset/twitter-social-graph-www2010',\n", + " 'https://datahub.ckan.io/is/dataset/6529915a-97e9-4a4a-9ced-677408c90679',\n", + " 'https://data.wu.ac.at/odso/datahub_io/NjUyOTkxNWEtOTdlOS00YTRhLTljZWQtNjc3NDA4YzkwNjc5']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e3'),\n", + " 'description': 'Crawl of the Flickr photo-sharing social '\n", + " 'network from May 2006 returning a graph with '\n", + " '820,878 nodes and 9,837,214 edges. 
Dataset is '\n", + " 'distributed as a SMAT file with README file '\n", + " 'with code to read file in Python and MATLAB.',\n", + " 'license': 'CC0 1.0 Universal',\n", + " 'title': 'Graph of Flickr Photo-Sharing Social Network Crawled '\n", + " 'in May 2006',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://purr.purdue.edu/publications/1002',\n", + " 'https://search.datacite.org/works/10.4231/d39p2w550',\n", + " 'https://doi.org/10.4231/D39P2W550']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e4'),\n", + " 'description': 'The Marvel Universe\\n'\n", + " 'Marvel Comics, originally called Timely Comics '\n", + " 'Inc., has been publishing comic books for '\n", + " 'several decades. \"The Golden Age of Comics\" '\n", + " 'name that was given due to the popularity of '\n", + " 'the books during the first years, was later '\n", + " 'followed by a period of decline of interest in '\n", + " 'superhero stories due to World War ref. In '\n", + " '1961, Marvel relaunched its superhero comic '\n", + " 'books publishing line. This new era started '\n", + " 'what has been known as the Marvel Age of '\n", + " 'Comics. Characters created during this period '\n", + " 'such as Spider-Man, the Hulk, the Fantastic '\n", + " 'Four, and the X-Men, together with those '\n", + " 'created during the Golden Age such as Captain '\n", + " 'America, are known worldwide and have become '\n", + " 'cultural icons during the last decades. Later, '\n", + " \"Marvel's characters popularity has been \"\n", + " 'revitalized even more due to the release of '\n", + " 'several recent movies which recreate the comic '\n", + " 'books using spectacular modern special '\n", + " 'effects. Nowadays, it is possible to access '\n", + " 'the content of the comic books via a digital '\n", + " 'platform created by Marvel, where it is '\n", + " 'possible to subscribe monthly or yearly to get '\n", + " 'access to the comics. 
More information about '\n", + " 'the Marvel Universe can be found here.\\n'\n", + " 'Content\\n'\n", + " 'The dataset contains heroes and comics, and '\n", + " 'the relationship between them. The dataset is '\n", + " 'divided into three files:\\n'\n", + " ' nodes.csv: Contains two columns (node, '\n", + " 'type), indicating the name and the type '\n", + " '(comic, hero) of the nodes.\\n'\n", + " ' edges.csv: Contains two columns (hero, '\n", + " 'comic), indicating in which comics the heroes '\n", + " 'appear.\\n'\n", + " ' hero-edge.csv: Contains the network of '\n", + " 'heroes which appear together in the comics. '\n", + " 'This file was originally taken from '\n", + " 'http://syntagmatic.github.io/exposedata/marvel/\\n',\n", + " 'license': 'https://creativecommons.org/licenses/by-nc-sa/4.0/',\n", + " 'title': 'The Marvel Universe Social Network',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://www.kaggle.com/csanhueza/the-marvel-universe-social-network']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e5'),\n", + " 'description': 'dynamic networks, temporal graphs, download '\n", + " 'dynamic networks, download temporal graph '\n", + " 'data, graph streams, edge stream data, '\n", + " 'time-evolving graph data, proximity networks, '\n", + " 'facebook message data, facebook forum data, '\n", + " 'digg, hospital, primary school proximity '\n", + " 'network, sensor network data, enron data, '\n", + " 'escort network, bitcoin graph data, wikipedia '\n", + " 'election graph, workplace contacts, radoslaw '\n", + " 'email network, youtube-growth',\n", + " 'license': 'http://networkrepository.com/policy.php',\n", + " 'title': 'Dynamic Networks',\n", + " 'topic': 'social graph',\n", + " 'urls': ['http://networkrepository.com/dynamic.php']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e6'),\n", + " 'description': 'Dataset containing the graphs generated using '\n", + " 'pull-requests, issues and commits of the '\n", + " 'top100 starred projects of Github in 
2016. In '\n", + " 'each graph, a pair of nodes (developers/users) '\n", + " 'are connected if they interacted on the same '\n", + " 'Github project element, e.g.: in the '\n", + " 'pull-request graph, they are connected if they '\n", + " 'have handled the same PR (similar with issues '\n", + " 'and committed files for the other two types of '\n", + " 'graph). A derived graph called supergraph is '\n", + " 'loaded in the dataset: it is calculated as the '\n", + " 'union of previous 3 graph types, e.g: two '\n", + " 'users are connected in this graph if they have '\n", + " 'interacted on the same issues, or on the same '\n", + " 'pull request or if they committed to the same '\n", + " 'file. A list of the projects is available, '\n", + " 'containing both Github IDs and complete name '\n", + " 'of the repository.',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'title': 'Github interaction graphs for rich-club behavior '\n", + " 'analysis',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AA4IIS',\n", + " 'https://doi.org/10.7910/DVN/AA4IIS']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e7'),\n", + " 'description': 'The data set has been downloaded via the '\n", + " 'OAI-PMH endpoint of the Berlin State '\n", + " 'Library/Staatsbibliothek zu Berlin’s Digitized '\n", + " 'Collections '\n", + " '(https://digital.staatsbibliothek-berlin.de/oai) '\n", + " 'on March 1st 2019 and converted into common '\n", + " 'tabular formats on the basis of the provided '\n", + " 'Dublin Core metadata. It contains 146,000 '\n", + " 'records. In addition to the bibliographic '\n", + " 'metadata, representative images of the works '\n", + " 'have been downloaded, resized to a 512 pixel '\n", + " 'maximum thumbnail image and saved in JPEG '\n", + " 'format. The image data is split into title '\n", + " 'pages and first pages. 
Title pages have been '\n", + " 'derived from structural metadata created by '\n", + " 'scan operators and librarians. If this '\n", + " 'information was not available, first pages of '\n", + " 'the media have been downloaded. In case of '\n", + " 'multi-volume media, title pages are not '\n", + " 'available. In total, 141,206 images '\n", + " 'title/first pages are available.\\n'\n", + " 'Furthermore, the tabular data has been cleaned '\n", + " 'and extended with geo-spatial coordinates '\n", + " 'provided by the OpenStreetMap project '\n", + " '(https://www.openstreetmap.org). The actual '\n", + " 'data processing steps are summarized in the '\n", + " 'next section. For the sake of transparency and '\n", + " 'reproducibility, the original data taken from '\n", + " 'the OAI-PMH endpoint is still present in the '\n", + " 'table.\\n'\n", + " 'To conclude with, various graphs in GML file '\n", + " 'format are available that can be loaded '\n", + " 'directly into graph analysis tools such as '\n", + " 'Gephi (https://gephi.org/).\\n'\n", + " 'The implementation of the data processing '\n", + " 'steps (incl. graph creation) are available as '\n", + " 'a Jupyter notebook provided at '\n", + " 'https://github.com/elektrobohemian/SBBrowse2018/blob/master/DataProcessing.ipynb.',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'title': 'Metadata, Title Pages, and Network Graph of the '\n", + " 'Digitized Content of the Berlin State Library '\n", + " '(146,000 items)',\n", + " 'topic': 'publication',\n", + " 'urls': ['https://figshare.com/articles/Metadata_Title_Pages_and_Network_Graph_of_the_Digitized_Content_of_the_Berlin_State_Library_146_000_items_/7797170',\n", + " 'https://zenodo.org/record/2582482',\n", + " 'https://doi.org/10.5281/zenodo.2582482']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e8'),\n", + " 'description': 'This is the graph database from the startup '\n", + " 'Relato (2015-2016). 
It contains links between '\n", + " 'businesses pulled from the web. It contains '\n", + " '373,663 links between companies, of the types '\n", + " '\"partnership\" (one company listed on another '\n", + " 'company\\'s partnership page), \"customer\" (one '\n", + " \"company listed on another company's example \"\n", + " 'customer page), \"competitor\" (co-bidders on '\n", + " 'AdWords above some limit), \"investment\" (a '\n", + " 'company listed on a VC\\'s website), \"supplier\" '\n", + " '(the inverse of the \"customer\" type. This '\n", + " 'dataset was used to drive both a lead '\n", + " 'generation system where metrics on the graph '\n", + " 'fed into a classification for leads (lead/no '\n", + " 'lead) and a market visualization system (a '\n", + " 'force directed layout of markets and their '\n", + " 'segments).',\n", + " 'license': 'CC-0',\n", + " 'title': 'Relato Business Graph Database',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://data.world/datasyndrome/relato-business-graph-database']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63e9'),\n", + " 'description': 'We study the problem of semi-supervised, '\n", + " 'multi-label user classification of networked '\n", + " 'data in the online social platform setting. We '\n", + " 'propose a framework that combines unsupervised '\n", + " 'community extraction and supervised, '\n", + " 'community-based feature weighting before '\n", + " 'training a classifier. We introduce '\n", + " 'Approximate Regularized Commute-Time Embedding '\n", + " '(ARCTE), an algorithm that projects the users '\n", + " 'of a social graph onto a latent space, but '\n", + " 'instead of packing the global structure into a '\n", + " 'matrix of predefined rank, as many spectral '\n", + " 'and neural representation learning methods do, '\n", + " 'it extracts local communities for all users in '\n", + " 'the graph in order to learn a sparse '\n", + " 'embedding. 
To this end, we employ an '\n", + " 'improvement of personalized PageRank '\n", + " 'algorithms for searching locally in each '\n", + " 'user’s graph structure. Then, we perform '\n", + " 'supervised community feature weighting in '\n", + " 'order to boost the importance of highly '\n", + " 'predictive communities. We assess our method '\n", + " 'performance on the problem of user '\n", + " 'classification by performing an extensive '\n", + " 'comparative study among various recent methods '\n", + " 'based on graph embeddings. The comparison '\n", + " 'shows that ARCTE significantly outperforms the '\n", + " 'competition in almost all cases, achieving up '\n", + " 'to 35% relative improvement compared to the '\n", + " 'second best competing method in terms of '\n", + " 'F1-score.',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0/',\n", + " 'title': 'Multilabel user classification using the community '\n", + " 'structure of online networks',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://figshare.com/articles/Multilabel_user_classification_using_the_community_structure_of_online_networks/4740307/1',\n", + " 'https://doi.org/10.1371/journal.pone.0173347']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63ea'),\n", + " 'description': 'meetup.com is a website for people organizing '\n", + " 'and attending regular or semi-regular events '\n", + " '(\"meet-ups\"). The relationships amongst '\n", + " 'users—who goes to what meetups—are a social '\n", + " 'network, ideal for graph-based analysis.\\n'\n", + " 'This dataset was generated for a talk titled '\n", + " 'Principles of Network Analysis with NetworkX, '\n", + " 'embedded online here (or with notebooks, etc. '\n", + " 'on Github). It forms the basis for a series of '\n", + " 'tutorials I presented on at PyNash and '\n", + " 'PyTennessee. In them, we work through the '\n", + " 'basics of graph theory and how to use '\n", + " 'NetworkX, a popular open-source Python '\n", + " 'package. 
We then apply this knowledge to '\n", + " 'extract insights about the social fabric of '\n", + " 'Tennessee MeetUp groups. ',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'title': 'Nashville Meetup Network',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://www.kaggle.com/stkbailey/nashville-meetup']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63eb'),\n", + " 'description': 'The project analysed the performance of '\n", + " 'community detection algorithms on the Twitter '\n", + " 'social network operating on a graph compressed '\n", + " 'using minhash signatures. The data supplied '\n", + " 'gives minhash signatures of roughly 16,000 '\n", + " 'Twitter users who have been classified into 16 '\n", + " 'categories. It is described in '\n", + " 'https://arxiv.org/abs/1601.03958 and together '\n", + " 'with code at '\n", + " 'https://github.com/melifluos/LSH-community-detection '\n", + " 'allows the results within to be replicated.',\n", + " 'license': 'NA',\n", + " 'title': 'Data from: Real-Time Community Detection in Full '\n", + " 'Social Networks on a Laptop',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://www.narcis.nl/dataset/RecordID/oai%3Aeasy.dans.knaw.nl%3Aeasy-dataset%3A76304',\n", + " 'https://doi.org/10.17026/dans-2bc-4qgc']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63ec'),\n", + " 'description': 'NILMPEds (NILM Performance Evaluation '\n", + " 'dataset), is a different type of NILM dataset, '\n", + " 'in a sense that it is aimed primarily at '\n", + " 'research reproducibility with respect to the '\n", + " 'development and performance evaluation of '\n", + " 'event detection algorithms. NILMPEds contains '\n", + " 'the results of 47950 event detection models '\n", + " 'when applied to four public event detection '\n", + " 'datasets. The different parameter '\n", + " 'configuration of each model and the '\n", + " 'ground-truth data are also available. 
Finally, '\n", + " 'this dataset also contains the performance '\n", + " 'evaluation of each model according to 31 '\n", + " 'performance metrics.',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'title': 'Data from: NILMPEds: A Performance Evaluation '\n", + " 'Dataset for Event Detection Algorithms in '\n", + " 'Non-Intrusive Load Monitoring',\n", + " 'topic': 'NILM',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/BUPJNR',\n", + " 'https://doi.org/10.7910/DVN/BUPJNR']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63ed'),\n", + " 'description': 'This dataset was created by sifting through '\n", + " 'the REFIT dataset to detect load anomalies; '\n", + " 'the rules for labelling anomalies are '\n", + " \"described in the accompanying ICASSP'19 paper, \"\n", + " 'which should be referenced if the dataset is '\n", + " 'used. Five of the 20 houses of the REFIT '\n", + " 'dataset were included in this dataset, as they '\n", + " 'contained the largest number of detected '\n", + " 'anomalies. These are Houses 1, 10, 16, 18 and '\n", + " '20. At the time of release, this is the first '\n", + " 'detailed annotated dataset of anomalies within '\n", + " 'publicly available electrical load '\n", + " 'measurements. These are real anomalies, not '\n", + " 'simulated ones and are extremely useful in '\n", + " 'understanding anomalous behaviour of '\n", + " 'electrical appliances, as measured by smart '\n", + " 'meters. Anomalous behaviour of the following '\n", + " 'appliances is included in this dataset: '\n", + " 'Refrigerator, freezer, fridge-freezer, '\n", + " 'dishwasher, washing machine, tumble dryer, '\n", + " 'electrical heater and microwave. When using '\n", + " 'this dataset, please cite the following paper: '\n", + " 'H. Rashid, V. Stankovic, L. Stankovic and P. 
'\n", + " 'Singh, \"Evaluation of Non-Intrusive Load '\n", + " 'Monitoring Algorithms for Appliance-level '\n", + " 'Anomaly Detection,\" Proc. IEEE 44th Int. Conf. '\n", + " 'on Acoustics, Speech and Signal Processing '\n", + " '(ICASSP), Brighton, UK, May 2019.',\n", + " 'license': 'http://creativecommons.org/licenses/by/4.0',\n", + " 'title': 'Annotated load anomalies from the REFIT Dataset',\n", + " 'topic': 'NILM',\n", + " 'urls': ['https://search.datacite.org/works/10.15129/9729a2a0-11ce-4cce-b0d0-144c483fcb33',\n", + " 'https://doi.org/10.15129/9729a2a0-11ce-4cce-b0d0-144c483fcb33']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63ee'),\n", + " 'description': 'This SBIR Phase I project will develop the '\n", + " 'Spacecraft Power Monitor (SPM) which will use '\n", + " 'non-intrusive electrical monitoring (NEMO). '\n", + " 'NEMO transforms the power distribution network '\n", + " 'in an spacecraft into a multiple-use service, '\n", + " 'providing not only power distribution but also '\n", + " 'a diagnostic monitoring capability based on '\n", + " 'careful measurement and analysis of power '\n", + " 'usage and start up and shut down transients. '\n", + " 'In depth analysis of this data enables real '\n", + " 'time assessment of system and component '\n", + " 'functioning and identifies potential system '\n", + " 'and component faults and failutes. We will use '\n", + " \"NEMO's ability to track load operation to \"\n", + " 'verify that the systems and components of a '\n", + " 'spacecraft are operating properly This '\n", + " '\"spacecraft power monitor\" or SPM, based on '\n", + " 'NEMO, will notify astronauts or ground support '\n", + " 'personnel when unexpected sequences occur. It '\n", + " 'can also generally track the health and '\n", + " 'diagnostic condition of key loads on the '\n", + " 'system. 
The system is light weight, small and '\n", + " 'inexpensive because the system requires only a '\n", + " 'sensor at the mains power input and uses '\n", + " 'existing power wiring to carry data. Phase I '\n", + " 'will involve ground measurements of spacecraft '\n", + " 'components. Phase II will involve measurements '\n", + " 'and analysis of an integrated system.',\n", + " 'license': 'NA',\n", + " 'title': 'Spacecraft Power Monitor, Phase I',\n", + " 'topic': 'anomaly detection',\n", + " 'urls': ['https://data.nasa.gov/dataset/Spacecraft-Power-Monitor-Phase-I/hw4h-ctgu',\n", + " 'https://catalog.data.gov/dataset/spacecraft-power-monitor-phase-i']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63ef'),\n", + " 'description': 'This archive contains replication data and the '\n", + " 'supplemental appendix for the article: '\n", + " 'Stinnett, Douglas, Bryan Early, Cale Horne, '\n", + " 'and Johannes Karreth. 2011. \"Complying by '\n", + " 'Denying: Explaining Why States Develop '\n", + " 'Nonproliferation Export Controls.\" '\n", + " 'International Studies Perspectives 12 (3), '\n", + " '308-326. 
We provide original data for the '\n", + " 'nonproliferation export control systems of 30 '\n", + " 'randomly selected countries, measured in '\n", + " 'multiple dimensions.',\n", + " 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',\n", + " 'title': 'Replication data for: Complying by Denying: '\n", + " 'Explaining Why States Develop Nonproliferation '\n", + " 'Export Controls',\n", + " 'topic': 'social graph',\n", + " 'urls': ['https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/W3B61P',\n", + " 'https://doi.org/10.7910/DVN/W3B61P']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63f0'),\n", + " 'description': 'The Department of Energy (DOE) Office of '\n", + " 'Scientific and Technical Information (OSTI), a '\n", + " 'unit of the Office of Science, fulfills '\n", + " 'agency-wide responsibilities to collect, '\n", + " 'preserve, and disseminate both unclassified '\n", + " 'and classified scientific and technical '\n", + " 'information (STI) emanating from DOE-funded '\n", + " 'research and development (R&D) activities at '\n", + " 'DOE national laboratories and facilities and '\n", + " 'at universities and other institutions '\n", + " 'nationwide. OSTI provides access to DOE STI '\n", + " 'through a suite of web-based, searchable '\n", + " 'discovery tools and through other commonly '\n", + " 'used search engines, offering ever-expanding '\n", + " 'sources of R&D information to DOE, the '\n", + " 'research community, and the science-attentive '\n", + " 'public.',\n", + " 'license': 'Public Domain',\n", + " 'title': 'U.S. Department of Energy Office of Scientific and '\n", + " 'Technical Information',\n", + " 'topic': 'publication',\n", + " 'urls': ['https://www.osti.gov/', 'https://www.osti.gov/MARC/']}\n", + "{'_id': ObjectId('5d93f62b9f7010afdece63f1'),\n", + " 'description': 'This is the sentiment140 dataset. It contains '\n", + " '1,600,000 tweets extracted using the twitter '\n", + " 'api . 
The tweets have been annotated (0 = '\n", + " 'negative, 4 = positive) and they can be used '\n", + " 'to detect sentiment .',\n", + " 'license': 'NA',\n", + " 'title': 'Sentiment140 dataset with 1.6 million tweets',\n", + " 'topic': 'social network',\n", + " 'urls': ['https://www.kaggle.com/kazanova/sentiment140']}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import pymongo, json\n", + "client = pymongo.MongoClient(host=\"da1.eecs.utk.edu\")\n", + "db = client['fdac19mp2']\n", + "coll = db['madams56']\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "for r in coll.find():\n", + " print(pp.pformat(r)) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}