-
Notifications
You must be signed in to change notification settings - Fork 46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
organize an example pagerank app employing the gar library (#44) #46
Changes from 1 commit
498cf3e
095e137
1d09b4d
c1596c3
8350fa2
c522cb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,4 +9,7 @@ spark/target/ | |
# docs | ||
/docs/_build/ | ||
|
||
# examples | ||
/examples/*/build | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Build script for the stand-alone PageRank example. It locates the GAR
# package with find_package(), builds Arrow through the build_arrow() helper
# from cmake/apache-arrow.cmake, and produces the `pagerank` executable.

# 3.8 is the first CMake release that provides the cxx_std_17 compile feature
# (and the VERSION_GREATER_EQUAL operator) used below.
cmake_minimum_required(VERSION 3.8)

# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
  cmake_policy(SET CMP0135 NEW)
endif()

project(Pagerank)

find_package(Threads REQUIRED)
find_package(gar REQUIRED)

# Append instead of overwrite so a module path supplied by the caller is kept.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(apache-arrow)
build_arrow()

include_directories(BEFORE SYSTEM ${ARROW_INCLUDE_DIR})

include_directories(${GAR_INCLUDE_DIRS})

add_executable(pagerank pagerank.cc)
target_compile_features(pagerank PRIVATE cxx_std_17)
target_link_libraries(pagerank PRIVATE ${GAR_LIBRARIES})
# pagerank.cc includes Arrow headers from ARROW_INCLUDE_DIR, which is only
# populated once the arrow_ep external project has been built and installed.
add_dependencies(pagerank arrow_ep)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
A PageRank Example | ||
------------------- | ||
|
||
This example demonstrates how to compute the PageRank of an input graph using GAR and write back the values as a new property of the graph. | ||
|
||
Integrate GAR | ||
^^^^^^^^^^^^^^^^^^ | ||
|
||
To include the GAR C++ library, add the following commands to the CMakeLists.txt: | ||
|
||
.. code-block:: cmake | ||
|
||
find_package(gar REQUIRED) | ||
include_directories(${GAR_INCLUDE_DIRS}) | ||
target_link_libraries(pagerank PRIVATE ${GAR_LIBRARIES}) | ||
|
||
|
||
Build the Project | ||
^^^^^^^^^^^^^^^^^^ | ||
|
||
.. code-block:: shell | ||
|
||
mkdir build && cd build | ||
cmake .. | ||
make | ||
|
||
|
||
Prepare the input graph | ||
^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
Copy the ldbc_sample graph from the test directory to /tmp: | ||
|
||
.. code-block:: shell | ||
|
||
cp -r ${project_dir}/test/gar-test/ldbc_sample /tmp/ | ||
|
||
|
||
Run the example | ||
^^^^^^^^^^^^^^^^ | ||
|
||
.. code-block:: shell | ||
|
||
./pagerank | ||
|
||
The output looks like: | ||
|
||
.. code-block:: shell | ||
|
||
num_vertices: 903 | ||
iter 0 | ||
iter 1 | ||
iter 2 | ||
iter 3 | ||
iter 4 | ||
iter 5 | ||
iter 6 | ||
iter 7 | ||
iter 8 | ||
iter 9 | ||
Done |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto. We can reuse the apache-arrow.cmake of GraphAr. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer remove arrow from our example |
||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
# This cmake file is referred and derived from | ||
# https://github.com/apache/arrow/blob/master/matlab/CMakeLists.txt | ||
|
||
|
||
# Build the Arrow C++ libraries. | ||
function(build_arrow)
  # Downloads Apache Arrow 9.0.0 and builds it as static libraries through
  # ExternalProject (target `arrow_ep`), then exposes the results as the
  # imported targets `arrow_static` and `parquet_static`.
  #
  # Takes no arguments; any argument passed is reported with SEND_ERROR.
  #
  # Cache variables set for the caller:
  #   ARROW_INCLUDE_DIR             - include directory of the installed Arrow
  #   PARQUET_STATIC_LIB            - path of the static parquet library
  #   ARROW_BUNDLED_DEPS_STATIC_LIB - path of the bundled-dependencies library
  set(options)
  set(one_value_args)
  set(multi_value_args)

  cmake_parse_arguments(ARG
                        "${options}"
                        "${one_value_args}"
                        "${multi_value_args}"
                        ${ARGN})
  if (ARG_UNPARSED_ARGUMENTS)
    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
  endif ()

  find_package(Threads)
  # If Arrow needs to be built, the default location will be within the build tree.
  set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-prefix")

  set(ARROW_STATIC_LIBRARY_DIR "${ARROW_PREFIX}/lib")

  set(ARROW_STATIC_LIB_FILENAME
      "${CMAKE_STATIC_LIBRARY_PREFIX}arrow${CMAKE_STATIC_LIBRARY_SUFFIX}")
  set(ARROW_STATIC_LIB "${ARROW_STATIC_LIBRARY_DIR}/${ARROW_STATIC_LIB_FILENAME}")
  set(PARQUET_STATIC_LIB_FILENAME
      "${CMAKE_STATIC_LIBRARY_PREFIX}parquet${CMAKE_STATIC_LIBRARY_SUFFIX}")
  set(PARQUET_STATIC_LIB "${ARROW_STATIC_LIBRARY_DIR}/${PARQUET_STATIC_LIB_FILENAME}" CACHE INTERNAL "parquet lib")
  set(ARROW_BUNDLED_DEPS_STATIC_LIB_FILENAME
      "${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}")
  set(ARROW_BUNDLED_DEPS_STATIC_LIB
      "${ARROW_STATIC_LIBRARY_DIR}/${ARROW_BUNDLED_DEPS_STATIC_LIB_FILENAME}" CACHE INTERNAL "bundled deps lib")

  set(ARROW_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-build")
  # CMAKE_INSTALL_LIBDIR is pinned to "lib" so the installed libraries land in
  # ARROW_STATIC_LIBRARY_DIR as computed above.
  set(ARROW_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}"
                       "-DARROW_BUILD_STATIC=ON" "-DARROW_BUILD_SHARED=OFF"
                       "-DARROW_DEPENDENCY_SOURCE=BUNDLED" "-DARROW_DEPENDENCY_USE_SHARED=OFF"
                       "-DCMAKE_INSTALL_LIBDIR=lib" "-Dxsimd_SOURCE=BUNDLED"
                       "-DARROW_PARQUET=ON" "-DARROW_WITH_RE2=OFF"
                       "-DARROW_WITH_UTF8PROC=OFF"
                       "-DARROW_FILESYSTEM=ON" "-DARROW_CSV=ON" "-DARROW_PYTHON=OFF"
                       "-DARROW_BUILD_BENCHMARKS=OFF" "-DARROW_BUILD_TESTS=OFF"
                       "-DARROW_BUILD_INTEGRATION=OFF" "-DBoost_SOURCE=BUNDLED"
                       "-DARROW_ORC=ON" "-DARROW_COMPUTE=ON"
                       "-DARROW_DATASET=ON" "-DARROW_WITH_SNAPPY=OFF" "-DARROW_WITH_LZ4=OFF"
                       "-DARROW_WITH_ZSTD=ON" "-DARROW_WITH_ZLIB=OFF" "-DARROW_WITH_BROTLI=OFF" "-DARROW_WITH_BZ2=OFF")

  set(ARROW_INCLUDE_DIR "${ARROW_PREFIX}/include" CACHE INTERNAL "arrow include directory")
  set(ARROW_BUILD_BYPRODUCTS "${ARROW_STATIC_LIB}" "${PARQUET_STATIC_LIB}")

  include(ExternalProject)
  externalproject_add(arrow_ep
                      URL "https://www.apache.org/dyn/closer.lua?action=download&filename=arrow/arrow-9.0.0/apache-arrow-9.0.0.tar.gz"
                      SOURCE_SUBDIR cpp
                      BINARY_DIR "${ARROW_BINARY_DIR}"
                      CMAKE_ARGS "${ARROW_CMAKE_ARGS}"
                      BUILD_BYPRODUCTS "${ARROW_BUILD_BYPRODUCTS}")

  set(ARROW_LIBRARY_TARGET arrow_static)
  set(PARQUET_LIBRARY_TARGET parquet_static)

  # The include directory is created up front because it is assigned to
  # INTERFACE_INCLUDE_DIRECTORIES below, before arrow_ep has installed anything.
  file(MAKE_DIRECTORY "${ARROW_INCLUDE_DIR}")
  add_library(${ARROW_LIBRARY_TARGET} STATIC IMPORTED)
  add_library(${PARQUET_LIBRARY_TARGET} STATIC IMPORTED)
  set_target_properties(${ARROW_LIBRARY_TARGET}
                        PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${ARROW_INCLUDE_DIR}"
                                   IMPORTED_LOCATION "${ARROW_STATIC_LIB}")
  set_target_properties(${PARQUET_LIBRARY_TARGET}
                        PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${ARROW_INCLUDE_DIR}"
                                   IMPORTED_LOCATION "${PARQUET_STATIC_LIB}")

  add_dependencies(${ARROW_LIBRARY_TARGET} arrow_ep)
endfunction()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
/** Copyright 2022 Alibaba Group Holding Limited. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
#include <filesystem> | ||
#include <iostream> | ||
|
||
#include "arrow/api.h" | ||
#include "arrow/filesystem/api.h" | ||
|
||
#include "gar/graph.h" | ||
#include "gar/graph_info.h" | ||
#include "gar/reader/arrow_chunk_reader.h" | ||
#include "gar/writer/arrow_chunk_writer.h" | ||
|
||
|
||
int main(int argc, char* argv[]) { | ||
// read file and construct graph info | ||
std::string path = "/tmp/ldbc_sample/parquet/ldbc_sample.graph.yml"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a little hard-code. Use the relative path to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My purpose is to make this example a new project over the GAR library. |
||
auto graph_info = GraphArchive::GraphInfo::Load(path).value(); | ||
|
||
// construct vertices collection | ||
std::string label = "person"; | ||
auto maybe_vertices = | ||
GraphArchive::ConstructVerticesCollection(graph_info, label); | ||
auto& vertices = maybe_vertices.value(); | ||
int num_vertices = vertices.size(); | ||
std::cout << "num_vertices: " << num_vertices << std::endl; | ||
|
||
// construct edges collection | ||
std::string src_label = "person", edge_label = "knows", dst_label = "person"; | ||
auto maybe_edges = GraphArchive::ConstructEdgesCollection( | ||
graph_info, src_label, edge_label, dst_label, | ||
GraphArchive::AdjListType::ordered_by_source); | ||
auto& edges = std::get<GraphArchive::EdgesCollection< | ||
GraphArchive::AdjListType::ordered_by_source>>(maybe_edges.value()); | ||
|
||
// run pagerank algorithm | ||
const double damping = 0.85; | ||
const int max_iters = 10; | ||
std::vector<double> pr_curr(num_vertices); | ||
std::vector<double> pr_next(num_vertices); | ||
std::vector<GraphArchive::IdType> out_degree(num_vertices); | ||
for (GraphArchive::IdType i = 0; i < num_vertices; i++) { | ||
pr_curr[i] = 1 / static_cast<double>(num_vertices); | ||
pr_next[i] = 0; | ||
out_degree[i] = 0; | ||
} | ||
auto it_begin = edges.begin(), it_end = edges.end(); | ||
for (auto it = it_begin; it != it_end; ++it) { | ||
GraphArchive::IdType src = it.source(); | ||
out_degree[src]++; | ||
} | ||
for (int iter = 0; iter < max_iters; iter++) { | ||
std::cout << "iter " << iter << std::endl; | ||
for (auto it = it_begin; it != it_end; ++it) { | ||
GraphArchive::IdType src = it.source(), dst = it.destination(); | ||
pr_next[dst] += pr_curr[src] / out_degree[src]; | ||
} | ||
for (GraphArchive::IdType i = 0; i < num_vertices; i++) { | ||
pr_next[i] = damping * pr_next[i] + | ||
(1 - damping) * (1 / static_cast<double>(num_vertices)); | ||
if (out_degree[i] == 0) | ||
pr_next[i] += damping * pr_curr[i]; | ||
pr_curr[i] = pr_next[i]; | ||
pr_next[i] = 0; | ||
} | ||
} | ||
|
||
// extend the original vertex info and write results to gar using writer | ||
// construct property group | ||
GraphArchive::Property pagerank = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As a pagerank example, I don't know if we just write the result to file not extend as a property would be better. That would have two benefits:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. To remove arrow here, you can use VertexBuilder instead of VertexPropertyWriter to write results. (please refer to "method 1 for writing results" in https://github.com/alibaba/GraphAr/blob/main/test/test_example/test_bgl_example.cc) |
||
"pagerank", GraphArchive::DataType(GraphArchive::Type::DOUBLE), false}; | ||
std::vector<GraphArchive::Property> property_vector = {pagerank}; | ||
GraphArchive::PropertyGroup group(property_vector, | ||
GraphArchive::FileType::PARQUET); | ||
// extend the vertex_info | ||
auto maybe_vertex_info = graph_info.GetVertexInfo(label); | ||
auto vertex_info = maybe_vertex_info.value(); | ||
auto maybe_extend_info = vertex_info.Extend(group); | ||
auto extend_info = maybe_extend_info.value(); | ||
// dump the extened vertex info | ||
assert(extend_info.IsValidated()); | ||
assert(extend_info.Dump().status().ok()); | ||
assert(extend_info.Save("/tmp/person-new-pagerank.vertex.yml").ok()); | ||
// construct vertex property writer | ||
GraphArchive::VertexPropertyWriter writer(extend_info, "/tmp/"); | ||
// convert results to arrow::Table | ||
std::vector<std::shared_ptr<arrow::Array>> arrays; | ||
std::vector<std::shared_ptr<arrow::Field>> schema_vector; | ||
schema_vector.push_back(arrow::field( | ||
pagerank.name, | ||
GraphArchive::DataType::DataTypeToArrowDataType(pagerank.type))); | ||
arrow::DoubleBuilder array_builder; | ||
assert(array_builder.Reserve(num_vertices).ok()); | ||
assert(array_builder.AppendValues(pr_curr).ok()); | ||
std::shared_ptr<arrow::Array> array = array_builder.Finish().ValueOrDie(); | ||
arrays.push_back(array); | ||
auto schema = std::make_shared<arrow::Schema>(schema_vector); | ||
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays); | ||
// dump the results through writer | ||
assert(writer.WriteTable(table, group, 0).ok()); | ||
|
||
std::cout << "Done" << std::endl; | ||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure we need a CMakeLists.txt here. It seems that putting these into the CMakeLists.txt in the root path and making the example a new target would be better.
A new CMakeLists.txt here would bring some drawbacks:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. I think what you suggested is the in-project examples. What I want to show here is a new application employing the GAR library, in which case, users should first install GAR, and then build their own application projects.