diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 7ba274c5960..462b4d977a9 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -38,3 +38,8 @@ small_tree = Dataset(meta_path / "small_tree.yaml") toy_graph = Dataset(meta_path / "toy_graph.yaml") toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") +# soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") # 250MB +# cit_patents = Dataset(meta_path / "cit-patents.yaml") # 965MB +# europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.8GB +# hollywood = Dataset(meta_path / "hollywood.yaml") # 1.5GB +# twitter = Dataset(meta_path / "soc-twitter-2010.yaml") # 8.8GB diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml new file mode 100644 index 00000000000..d5c4cf195bd --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -0,0 +1,22 @@ +name: cit-Patents +file_type: .csv +description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. +author: NBER +refs: + J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. + ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 16518948 +number_of_nodes: 3774768 +url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml new file mode 100644 index 00000000000..fe0e42a4b86 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -0,0 +1,21 @@ +name: europe_osm +file_type: .csv +description: A graph of OpenStreetMap data for Europe. +author: M. Kobitzsch / Geofabrik GmbH +refs: + Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 54054660 +number_of_nodes: 50912018 +url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml new file mode 100644 index 00000000000..8a671c98269 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -0,0 +1,25 @@ +name: hollywood +file_type: .csv +description: + A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. +author: Laboratory for Web Algorithmics (LAW) +refs: >- + "The WebGraph Framework I: Compression Techniques," Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + pp. 595--601, ACM Press. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 113891327 +number_of_nodes: 1139905 +url: https://data.rapids.ai/cugraph/datasets/hollywood.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml new file mode 100644 index 00000000000..df11dd9a364 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -0,0 +1,22 @@ +name: soc-LiveJournal1 +file_type: .csv +description: A graph of the LiveJournal social network. +author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan +refs: + L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in + Large Social Networks Membership, Growth, and Evolution. KDD, 2006. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 68993773 +number_of_nodes: 4847571 +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml new file mode 100644 index 00000000000..5ae2cf7deeb --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -0,0 +1,22 @@ +name: soc-twitter-2010 +file_type: .csv +description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. +author: H. Kwak, C. Lee, H. Park, S. Moon +refs: + J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl. + Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 530051354 +number_of_nodes: 21297772 +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 643d0468d46..2b6c3820632 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -94,6 +94,7 @@ def setup_deprecation_warning_tests(): # Helpers # check if there is a row where src == dst +# Should this be renamed to 'has_self_loop'? def has_loop(df): df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True) res = df.where(df["src"] == df["dst"])