Skip to content

Commit

Permalink
[Feature](multi-catalog) Support hadoop viewfs.
Browse files Browse the repository at this point in the history
  • Loading branch information
kaka11chen committed Sep 11, 2023
1 parent 586492c commit 722ca94
Show file tree
Hide file tree
Showing 12 changed files with 163 additions and 10 deletions.
8 changes: 2 additions & 6 deletions be/src/io/fs/hdfs_file_system.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, const std::string
_hdfs_params(hdfs_params),
_fs_handle(nullptr),
_profile(profile) {
if (_hdfs_params.__isset.fs_name) {
if (fs_name.empty() && _hdfs_params.__isset.fs_name) {
_fs_name = _hdfs_params.fs_name;
} else {
_fs_name = fs_name;
Expand Down Expand Up @@ -512,11 +512,7 @@ Status HdfsFileSystemCache::get_connection(const THdfsParams& hdfs_params,
uint64 HdfsFileSystemCache::_hdfs_hash_code(const THdfsParams& hdfs_params,
const std::string& fs_name) {
uint64 hash_code = 0;
if (hdfs_params.__isset.fs_name) {
hash_code += Fingerprint(hdfs_params.fs_name);
} else {
hash_code += Fingerprint(fs_name);
}
hash_code += Fingerprint(fs_name);
if (hdfs_params.__isset.user) {
hash_code += Fingerprint(hdfs_params.user);
}
Expand Down
22 changes: 22 additions & 0 deletions docs/en/docs/lakehouse/multi-catalog/hive.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,28 @@ Please place the `krb5.conf` file and `keytab` authentication file under all `BE

The value of `hive.metastore.kerberos.principal` needs to be consistent with the property of the same name of the connected hive metastore, which can be obtained from `hive-site.xml`.

### Hive On VIEWFS

```sql
CREATE CATALOG hive PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
'hadoop.username' = 'hive',
'dfs.nameservices'='your-nameservice',
'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
'fs.defaultFS' = 'viewfs://your-cluster',
'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
);
```

ViewFS-related parameters can be added to the catalog configuration as shown above, or added to `conf/core-site.xml`.

For details on how ViewFS works and how to configure its parameters, please refer to the relevant Hadoop documentation, for example: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html

### Hive On JuiceFS

Data is stored in JuiceFS, examples are as follows:
Expand Down
22 changes: 22 additions & 0 deletions docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,28 @@ CREATE CATALOG hive PROPERTIES (
请在所有的 `BE`、`FE` 节点下放置 `krb5.conf` 文件和 `keytab` 认证文件,`keytab` 认证文件路径和配置保持一致,`krb5.conf` 文件默认放置在 `/etc/krb5.conf` 路径。
`hive.metastore.kerberos.principal` 的值需要和所连接的 hive metastore 的同名属性保持一致,可从 `hive-site.xml` 中获取。

### Hive On VIEWFS

```sql
CREATE CATALOG hive PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
'hadoop.username' = 'hive',
'dfs.nameservices'='your-nameservice',
'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
'fs.defaultFS' = 'viewfs://your-cluster',
'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
);
```

viewfs 相关参数可以如上面一样添加到 catalog 配置中,也可以添加到 `conf/core-site.xml` 中。

viewfs 工作原理和参数配置可以参考 hadoop 相关文档,比如 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html

### Hive On JuiceFS

数据存储在JuiceFS,示例如下:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static void checkPath(String path, StorageBackend.StorageType type) throw
if (!schema.equalsIgnoreCase("bos")
&& !schema.equalsIgnoreCase("afs")
&& !schema.equalsIgnoreCase("hdfs")
&& !schema.equalsIgnoreCase("viewfs")
&& !schema.equalsIgnoreCase("ofs")
&& !schema.equalsIgnoreCase("obs")
&& !schema.equalsIgnoreCase("oss")
Expand All @@ -58,8 +59,9 @@ public static void checkPath(String path, StorageBackend.StorageType type) throw
&& !schema.equalsIgnoreCase("gfs")
&& !schema.equalsIgnoreCase("jfs")
&& !schema.equalsIgnoreCase("gs")) {
throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'afs://' , 'bos://',"
+ " 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://' or 'jfs://' path.");
throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'viewfs://', 'afs://',"
+ " 'bos://', 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://'"
+ " or 'jfs://' path.");
}
} else if (type == StorageBackend.StorageType.S3 && !schema.equalsIgnoreCase("s3")) {
throw new AnalysisException("Invalid export path. please use valid 's3://' path.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ public class FeConstants {
public static String FS_PREFIX_GFS = "gfs";
public static String FS_PREFIX_JFS = "jfs";
public static String FS_PREFIX_HDFS = "hdfs";
public static String FS_PREFIX_VIEWFS = "viewfs";
public static String FS_PREFIX_FILE = "file";
public static final String INTERNAL_DB_NAME = "__internal_schema";
public static String TEMP_MATERIZLIZE_DVIEW_PREFIX = "internal_tmp_materialized_view_";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ public static Pair<FileSystemType, String> getFSIdentity(String location) {
} else {
fsType = FileSystemType.S3;
}
} else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)) {
} else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)
|| location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
fsType = FileSystemType.DFS;
} else if (location.startsWith(FeConstants.FS_PREFIX_OFS) || location.startsWith(FeConstants.FS_PREFIX_COSN)) {
// ofs:// and cosn:// use the same underlying file system: Tencent Cloud HDFS, aka CHDFS)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,8 @@ protected static Optional<TFileType> getTFileType(String location) {
return Optional.of(TFileType.FILE_S3);
} else if (location.startsWith(FeConstants.FS_PREFIX_HDFS)) {
return Optional.of(TFileType.FILE_HDFS);
} else if (location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
return Optional.of(TFileType.FILE_HDFS);
} else if (location.startsWith(FeConstants.FS_PREFIX_COSN)) {
return Optional.of(TFileType.FILE_HDFS);
} else if (location.startsWith(FeConstants.FS_PREFIX_FILE)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class FileSystemManager {
.getLogger(FileSystemManager.class.getName());
// supported scheme
private static final String HDFS_SCHEME = "hdfs";
private static final String VIEWFS_SCHEME = "viewfs";
private static final String S3A_SCHEME = "s3a";
private static final String KS3_SCHEME = "ks3";
private static final String CHDFS_SCHEME = "ofs";
Expand Down Expand Up @@ -210,7 +211,7 @@ public BrokerFileSystem getFileSystem(String path, Map<String, String> propertie
"invalid path. scheme is null");
}
BrokerFileSystem brokerFileSystem = null;
if (scheme.equals(HDFS_SCHEME)) {
if (scheme.equals(HDFS_SCHEME) || scheme.equals(VIEWFS_SCHEME)) {
brokerFileSystem = getDistributedFileSystem(path, properties);
} else if (scheme.equals(S3A_SCHEME)) {
brokerFileSystem = getS3AFileSystem(path, properties);
Expand Down
35 changes: 35 additions & 0 deletions regression-test/data/external_table_p2/hive/test_viewfs_hive.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !viewfs --
1 Tom 48 \N male
2 Jerry 35 \N male
3 Frank 25 \N male
4 Ada 22 \N female

-- !viewfs_partition1 --
1 Tom 48 \N male 20230101
2 Jerry 35 \N male 20230101
3 Frank 25 \N male 20230201
4 Ada 22 \N female 20230201

-- !viewfs_partition2 --
1 Tom 48 \N male 20230101
2 Jerry 35 \N male 20230101

-- !viewfs_partition3 --
3 Frank 25 \N male 20230201
4 Ada 22 \N female 20230201

-- !viewfs_mixed_partition1 --
1 Tom 48 \N male 20230101
2 Jerry 35 \N male 20230101
3 Frank 25 \N male 20230201
4 Ada 22 \N female 20230201

-- !viewfs_mixed_partition2 --
1 Tom 48 \N male 20230101
2 Jerry 35 \N male 20230101

-- !viewfs_mixed_partition3 --
3 Frank 25 \N male 20230201
4 Ada 22 \N female 20230201

3 changes: 3 additions & 0 deletions regression-test/data/external_table_p2/tvf/test_tvf_p2.out
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@
-- !row_cross_pages --
25001 25001 25001

-- !viewfs --
25001 25001 25001

Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("test_viewfs_hive", "p2,external,hive,external_remote,external_remote_hive") {

String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
String nameNodeHost = context.config.otherConfigs.get("extHiveHmsHost")
String hdfsPort = context.config.otherConfigs.get("extHdfsPort")
String catalog_name = "test_viewfs_hive"

sql """drop catalog if exists ${catalog_name};"""

sql """
create catalog if not exists ${catalog_name} properties (
'type'='hms',
'hadoop.username' = 'hadoop',
'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}',
'fs.viewfs.mounttable.my-cluster.link./ns1' = 'hdfs://${nameNodeHost}:${hdfsPort}/',
'fs.viewfs.mounttable.my-cluster.homedir' = '/ns1',
'fs.defaultFS' = 'viewfs://my-cluster'
);
"""
logger.info("catalog " + catalog_name + " created")
sql """switch ${catalog_name};"""
logger.info("switched to catalog " + catalog_name)

sql """ use viewfs """

// The location of table is on viewfs.
qt_viewfs """ select * from test_viewfs order by id"""

// The location of partition table is on viewfs.
qt_viewfs_partition1 """ select * from test_viewfs_partition order by id"""
qt_viewfs_partition2 """ select * from test_viewfs_partition where part_col = 20230101 order by id"""
qt_viewfs_partition3 """ select * from test_viewfs_partition where part_col = 20230201 order by id"""

// The location of partition table contains hdfs and viewfs locations partitions.
qt_viewfs_mixed_partition1 """ select * from test_viewfs_mixed_partition order by id"""
qt_viewfs_mixed_partition2 """ select * from test_viewfs_mixed_partition where part_col = 20230101 order by id"""
qt_viewfs_mixed_partition3 """ select * from test_viewfs_mixed_partition where part_col = 20230201 order by id"""
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,14 @@ suite("test_tvf_p2", "p2,external,tvf,external_remote,external_remote_tvf") {
"uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/row_cross_pages.parquet",
"format" = "parquet",
"fs.defaultFS" = "hdfs://${nameNodeHost}:${hdfsPort}")"""

// viewfs
qt_viewfs """select count(id), count(m1), count(m2)
from hdfs(
"uri" = "viewfs://my-cluster/ns1/catalog/tvf/parquet/row_cross_pages.parquet",
"format" = "parquet",
"fs.defaultFS" = "viewfs://my-cluster",
"fs.viewfs.mounttable.my-cluster.link./ns1" = "hdfs://${nameNodeHost}:${hdfsPort}/",
"fs.viewfs.mounttable.my-cluster.homedir" = "/ns1")"""
}
}

0 comments on commit 722ca94

Please sign in to comment.