Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generate latest docs on CI, from commit 106912d.
Browse files Browse the repository at this point in the history
github-actions committed Jan 24, 2024
0 parents commit 48a1f3b
Showing 744 changed files with 275,998 additions and 0 deletions.
88 changes: 88 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright 2022-2023 Alibaba Group Holding Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/javascript-node
{
    "name": "GraphAr",
    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
    "image": "registry.cn-hongkong.aliyuncs.com/graphscope/graphar-dev:latest",
    // "image": "ubuntu:22.04",

    // Features to add to the dev container. More info: https://containers.dev/features.
    "features": {
        "ghcr.io/devcontainers/features/common-utils:2": {
            // All options are written as JSON booleans so the block is
            // consistent (previously a mix of "true"/"false" strings and true).
            "installZsh": true,
            "configureZshAsDefaultShell": true,
            "installOhMyZsh": true,
            "upgradePackages": false
        }
    },
    // Configure tool-specific properties.
    "customizations": {
        // Configure properties specific to VS Code.
        "vscode": {
            "settings": {},
            "extensions": [
                "streetsidesoftware.code-spell-checker",
                "eamodio.gitlens",
                "github.copilot",
                "github.copilot-labs"
            ]
        }
    },

    // Set `remoteUser` to `root` to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
    "remoteUser": "graphar",

    // Use 'initializeCommand' to run commands on the host before the container is created.
    // Here it pulls the same image that "image" refers to, so the `latest` tag is refreshed first.
    "initializeCommand": "sudo docker pull registry.cn-hongkong.aliyuncs.com/graphscope/graphar-dev:latest",

    // Uncomment this to enable C++ and Rust debugging in containers
    // "capAdd": ["SYS_PTRACE"],
    // "securityOpt": ["seccomp=unconfined"],

    // Use 'forwardPorts' to make a list of ports inside the container available locally.
    // "forwardPorts": [3000],

    // Use 'portsAttributes' to set default properties for specific forwarded ports.
    // More info: https://containers.dev/implementors/json_reference/#port-attributes
    // "portsAttributes": {
    //     "9000": {
    //         "label": "Hello Remote World",
    //         "onAutoForward": "notify"
    //     }
    // },

    // Use 'postCreateCommand' to run commands after the container is created.
    // "postCreateCommand": "yarn install"

    // Improve performance

    // Uncomment these to mount a folder to a volume
    // https://code.visualstudio.com/remote/advancedcontainers/improve-performance#_use-a-targeted-named-volume
    // "mounts": [
    //     "source=${localWorkspaceFolderBasename}-node_modules,target=${containerWorkspaceFolder}/node_modules,type=volume"
    // ],


    // Uncomment these to use a named volume for your entire source tree
    // https://code.visualstudio.com/remote/advancedcontainers/improve-performance#_use-a-named-volume-for-your-entire-source-tree
    // "workspaceMount": "source=gs,target=/workspaces,type=volume",
    // "workspaceFolder": "/workspaces"

    // After the container is created: give the `graphar` user ownership of
    // /workspaces, then run the repository's two pre-commit setup scripts.
    "postCreateCommand": "sudo chown -R graphar /workspaces && bash pre-commit/install-hook.sh && bash pre-commit/prepare-commit-msg"
}
83 changes: 83 additions & 0 deletions .gitleaks.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Gitleaks configuration: custom rules for Alibaba Cloud credentials layered
# on top of the built-in rule set.
title = "Gitleaks for GraphAr"

[extend]
# Keep gitleaks' default rules active in addition to the rules below.
useDefault = true

# Each rule has its own description so that findings can be told apart in
# reports; ids, regexes and keywords are the matching contract and are
# unchanged.

# Literal AccessKey IDs, recognised by their "LTAI" prefix.
[[rules]]
description = "Alibaba AccessKey ID"
id = "alibaba-access-key-id"
regex = '''(?i)((LTAI)[a-z0-9]+)'''
keywords = ["ltai"]

# `access<sep>id = ...` style assignments.
[[rules]]
description = "Alibaba access ID assignment in config"
id = "alibaba-access-id-in-config"
regex = '''(?i)((access).?id\s*=\s*.+)'''
keywords = ["access"]

# `access<sep>key = ...` style assignments.
[[rules]]
description = "Alibaba access key assignment in config"
id = "alibaba-access-key-in-config"
regex = '''(?i)((access).?key\s*=\s*.+)'''
keywords = ["access"]

# `access<sep>secret = ...` style assignments.
[[rules]]
description = "Alibaba access secret assignment in config"
id = "alibaba-access-secret-in-config"
regex = '''(?i)((access).?secret\s*=\s*.+)'''
keywords = ["access", "secret"]

# `access<sep>key<sep>id = ...` style assignments.
[[rules]]
description = "Alibaba AccessKey ID assignment in config"
id = "alibaba-access-key-id-in-config"
regex = '''(?i)((access).?key.?id\s*=\s*.+)'''
keywords = ["access"]

# Files exempt from the rule directly above.
# NOTE(review): these paths look inherited from another project's layout;
# confirm they exist in this repository.
[rules.allowlist]
paths = [
    '''modules/io/python/drivers/io/tests/test_open.py''',
    '''modules/io/python/drivers/io/tests/test_serialize.py''',
]

# `access<sep>key<sep>secret = ...` style assignments.
[[rules]]
description = "Alibaba AccessKey Secret assignment in config"
id = "alibaba-access-key-secret-in-config"
regex = '''(?i)((access).?key.?secret\s*=\s*.+)'''
keywords = ["access", "secret"]

# Files exempt from the rule directly above.
# NOTE(review): same path list as the previous rule allowlist; confirm it
# applies to this repository.
[rules.allowlist]
paths = [
    '''modules/io/python/drivers/io/tests/test_open.py''',
    '''modules/io/python/drivers/io/tests/test_serialize.py''',
]

# `secret<sep>access<sep>key = ...` style assignments.
[[rules]]
description = "Alibaba secret access key assignment in config"
id = "alibaba-secret-access-key-in-config"
regex = '''(?i)((secret).?access.?key\s*=\s*.+)'''
keywords = ["access", "secret"]

# Paths excluded from scanning for every rule.
[allowlist]
paths = [
    '''build''',
    '''docs/_build''',
    '''docs/_templates/footer.html''',
    '''thirdparty''',
]
51 changes: 51 additions & 0 deletions .licenserc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# License-header check configuration: the expected header text, the paths
# exempt from the check, and the dependency manifests to inspect.
header:
license:
spdx-id: Apache-2.0
# NOTE(review): the owner named here differs from the copyright holder in
# `content` below (Alibaba Group Holding Limited) - confirm which is intended.
copyright-owner: Apache Software Foundation
content: |
Copyright 2022-2023 Alibaba Group Holding Limited.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
paths-ignore:
- 'dist'
- 'licenses'
- '**/*.md'
- 'LICENSE'
- 'NOTICE'
- 'testing'
- 'spark/src/test/resources'
- 'java/src/test/resources'
- '.licenserc.yaml'
- '.gitignore'
- '.gitleaks.toml'
- '.gitmodules'
# NOTE(review): the pre-commit configuration file is conventionally named
# `.pre-commit-config.yaml` (leading dot); this entry may match nothing - confirm.
- 'pre-commit-config.yaml'
- 'docs'
- '**/.gitignore'
- 'spark/.scalafix.conf'
- 'spark/.scalafmt.conf'
- 'cpp/apidoc'
- 'spark/src/main/scala/com/alibaba/graphar/datasources'
- '*.md'
- '*.rst'
- '**/*.json'
- 'pyspark/poetry.lock' # This file is generated automatically by Poetry-tool; there is no way to add license header

comment: on-failure

# If you don't want to check dependencies' license compatibility, remove the following part
dependency:
files:
- spark/pom.xml # If this is a maven project.
- java/pom.xml # If this is a maven project.
Empty file added .nojekyll
Empty file.
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2022-2023 Alibaba Group Holding Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hook repositories used by pre-commit; only the gitleaks hook is configured.
repos:
- repo: https://github.com/zricethezav/gitleaks
# Pinned tag of the hook repository.
rev: v8.15.0
hooks:
- id: gitleaks
# Extra command-line arguments passed to the hook.
args:
- '--verbose'

Binary file added _images/edge_logical_table.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/edge_physical_table1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/edge_physical_table2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/property_graph.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/vertex_logical_table.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added _images/vertex_physical_table.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/* Document-wide custom properties under the `--tabs-` prefix: label colors,
   overline/underline colors and label font size.
   NOTE(review): presumably read by a tabs widget stylesheet - confirm the consumer. */
:root {
/* Label text colors for the active and inactive states. */
--tabs-color-label-active: hsla(231, 99%, 66%, 1);
--tabs-color-label-inactive: rgba(178, 206, 245, 0.62);
/* Overline and underline share the same color value. */
--tabs-color-overline: rgb(207, 236, 238);
--tabs-color-underline: rgb(207, 236, 238);
--tabs-size-label: 1rem;
}
93 changes: 93 additions & 0 deletions _sources/cpp/examples/bgl.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
Co-Work with BGL
============================

The `Boost Graph Library (BGL) <https://cs.brown.edu/~jwicks/boost/libs/graph/doc/>`_ is the first C++ library to apply the principles of generic programming to the construction of the advanced data structures and algorithms used in graph computations. The BGL graph interface and graph components are generic in the same sense as the Standard Template Library (STL). And it provides some built-in algorithms which cover a core set of algorithm patterns and a larger set of graph algorithms.

We take computing connected components (CC) as an example to demonstrate how BGL works with GraphAr. A weakly connected component is a maximal subgraph of a graph such that for every pair of vertices in it, there is an undirected path connecting them. The CC algorithm identifies all such components in a graph. Learn more about `the CC algorithm <https://en.wikipedia.org/wiki/Connected_component>`_.

The source code of CC based on BGL can be found at `bgl_example.cc`_. In this program, the graph information file is first read to get the metadata:

.. code:: C++

std::string path = ... // the path of the graph information file
auto graph_info = GraphArchive::GraphInfo::Load(path).value();

And then, the vertex collection and the edge collection are established as the handles to access the graph data:

.. code:: C++

auto maybe_vertices = GraphArchive::VerticesCollection::Make(graph_info, "person");
auto vertices = maybe_vertices.value();
auto maybe_edges = GraphArchive::EdgesCollection::Make(graph_info, "person", "knows", "person", GraphArchive::AdjListType::ordered_by_source);
auto edges = maybe_edges.value();

Next, we construct the in-memory graph data structure for BGL by traversing the vertices and edges via GraphAr's high-level reading interface (the vertex iterator and the edge iterator):

.. code:: C++

// define the Graph type in BGL
typedef boost::adjacency_list<boost::vecS, // use vector to store edges
boost::vecS, // use vector to store vertices
boost::undirectedS, // undirected
boost::property<boost::vertex_name_t, int64_t>, // vertex property
boost::no_property> Graph; // no edge property
// descriptors for vertex in BGL
typedef typename boost::graph_traits<Graph>::vertex_descriptor Vertex;

// declare a graph object with (num_vertices) vertices and an edge iterator
std::vector<std::pair<GraphArchive::IdType, GraphArchive::IdType>> edges_array;
auto it_begin = edges->begin(), it_end = edges->end();
for (auto it = it_begin; it != it_end; ++it)
edges_array.push_back(std::make_pair(it.source(), it.destination()));
Graph g(edges_array.begin(), edges_array.end(), num_vertices);

// define the internal vertex property "id"
boost::property_map<Graph, boost::vertex_name_t>::type id = get(boost::vertex_name_t(), g);
auto v_it_begin = vertices->begin(), v_it_end = vertices->end();
for (auto it = v_it_begin; it != v_it_end; ++it) {
auto vertex = *it;
boost::put(id, vertex.id(), vertex.property<int64_t>("id").value());
}

After that, an internal CC algorithm provided by BGL is called:

.. code:: C++

// define the external vertex property "component"
std::vector<int> component(num_vertices);
// call algorithm: cc
int cc_num = boost::connected_components(g, &component[0]);
std::cout << "Total number of components: " << cc_num << std::endl;

Finally, we could use a **VerticesBuilder** of GraphAr to write the results to newly generated GAR files:

.. code:: C++

// construct a new property group
GraphArchive::Property cc = {"cc", GraphArchive::int32(), false};
std::vector<GraphArchive::Property> property_vector = {cc};
auto group = GraphArchive::CreatePropertyGroup(property_vector, GraphArchive::FileType::PARQUET);

// construct the new vertex info
std::string vertex_label = "cc_result", vertex_prefix = "result/";
int chunk_size = 100;
auto new_info = GraphArchive::CreateVertexInfo(vertex_label, chunk_size, {group}, vertex_prefix);

// access the vertices via the index map and vertex iterator of BGL
typedef boost::property_map<Graph, boost::vertex_index_t>::type IndexMap;
IndexMap index = boost::get(boost::vertex_index, g);
typedef boost::graph_traits<Graph>::vertex_iterator vertex_iter;
std::pair<vertex_iter, vertex_iter> vp;

// dump the results through the VerticesBuilder
GraphArchive::builder::VerticesBuilder builder(new_info, "/tmp/");
for (vp = boost::vertices(g); vp.first!= vp.second; ++vp.first) {
Vertex v = *vp.first;
GraphArchive::builder::Vertex vertex(index[v]);
vertex.AddProperty(cc.name, component[index[v]]);
builder.AddVertex(vertex);
}
builder.Dump();

.. _bgl_example.cc: https://github.com/alibaba/GraphAr/blob/main/cpp/examples/bgl_example.cc
32 changes: 32 additions & 0 deletions _sources/cpp/examples/graphscope.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Integrate into GraphScope
============================

`GraphScope <https://graphscope.io/>`_ is a unified distributed graph computing platform that provides a one-stop environment for performing diverse graph operations on a cluster through a user-friendly Python interface. As an important application case of GraphAr, we have integrated it into GraphScope.

GraphScope works on a graph G fragmented via a partition strategy picked by the user and each worker maintains a fragment of G. Given a query, it posts the same query to all the workers and computes following the BSP (Bulk Synchronous Parallel) model. More specifically, each worker first executes processing against its local fragment, to compute partial answers in parallel. And then each worker may exchange partial results with other processors via synchronous message passing.

To integrate GraphAr into GraphScope, we implemented *ArrowFragmentBuilder* and *ArrowFragmentWriter*. *ArrowFragmentBuilder* establishes the fragments for workers of GraphScope through reading GAR files in parallel. Conversely, *ArrowFragmentWriter* can take the GraphScope fragments and save them as GAR files. If you're interested in knowing more about the implementation, please refer to the `source code <https://github.com/v6d-io/v6d/commit/0eda2067e45fbb4ac46892398af0edc84fe1c27b>`_.


Performance Report
------------------------

Parameter settings
``````````````````
The time performance of *ArrowFragmentBuilder* and *ArrowFragmentWriter* in GraphScope is heavily dependent on the partitioning of the graph into GAR files, that is, the *vertex chunk size* and *edge chunk size*, which are specified in the vertex information file and in the edge information file, respectively. See `GraphAr File Format <../user-guide/file-format.html>`_ to understand the chunk size definitions in GAR.

Generally speaking, fewer chunks are created if the file size is large. On small graphs, this can be disadvantageous as it reduces the degree of parallelism, prolonging disk I/O time. On the other hand, having too many small files increases the overhead associated with the file system and the file parser.

We have conducted micro benchmarks to compare the time performance for reading/writing GAR files by *ArrowFragmentBuilder*/*ArrowFragmentWriter*, across different *vertex chunk size* and *edge chunk size* configurations. The settings we recommend for *vertex chunk size* and *edge chunk size* are **2^18** and **2^22**, respectively, which lead to efficient performance in most cases. These settings can be used as the reference values when integrating GraphAr into other systems besides GraphScope.

Time performance results
````````````````````````
Here we report the performance results of *ArrowFragmentBuilder*, and compare it with loading the same graph through the default loading strategy of GraphScope (by reading the CSV files in parallel). The execution time reported below includes loading the graph data from the disk into memory, as well as building GraphScope fragments from such data. The experiments are conducted on a cluster of 4 AliCloud ecs.r6.6xlarge instances (24vCPU, 192GB memory), and using `com-friendster <https://snap.stanford.edu/data/com-Friendster.html>`_ (a simple graph) and `ldbc-snb-30 <https://ldbcouncil.org/benchmarks/snb/>`_ (a multi-labeled property graph) as datasets.

+----------------+---------+-----------------+-----------------+
| Dataset | Workers | Default Loading | GraphAr Loading |
+================+=========+=================+=================+
| com-friendster | 4 | 282s | 54s |
+----------------+---------+-----------------+-----------------+
| ldbc-snb-30 | 4 | 196s | 40s |
+----------------+---------+-----------------+-----------------+
9 changes: 9 additions & 0 deletions _sources/cpp/examples/index.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Examples
---------

.. toctree::
:maxdepth: 2

bgl
graphscope
out-of-core
Loading

0 comments on commit 48a1f3b

Please sign in to comment.