Skip to content

Commit

Permalink
Merge pull request #11310 from rouault/fix_11309
Browse files Browse the repository at this point in the history
/vsicurl/: fix to allow reading partitioned Parquet datasets from a public Azure container using /vsicurl/
  • Loading branch information
rouault authored Nov 26, 2024
2 parents 248346c + 48e4a28 commit 89c4ca9
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 43 deletions.
21 changes: 21 additions & 0 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3355,6 +3355,27 @@ def test_ogr_parquet_bbox_float32_but_no_covering_in_metadata(use_dataset):
###############################################################################


@gdaltest.enable_exceptions()
@pytest.mark.require_curl
def test_ogr_parquet_overture_from_azure():
    """Open a directory-style Parquet dataset hosted on Azure Blob Storage
    through /vsicurl/ and check that its first layer reports features.

    The test is skipped when the build lacks ArrowDataset support, or when
    the container listing URL cannot be opened within the 5 second timeout.
    """

    # Directory-based datasets need the ArrowDataset component.
    if not _has_arrow_dataset():
        pytest.skip("Test requires build with ArrowDataset")

    # Container listing request, used here only to probe that the remote
    # server is reachable before attempting the real open.
    url = "https://overturemapswestus2.blob.core.windows.net/release?comp=list&delimiter=%2F&prefix=2024-11-13.0%2Ftheme%3Ddivisions%2Ftype%3Ddivision_area%2F&restype=container"
    probe = gdaltest.gdalurlopen(url, timeout=5)
    if probe is None:
        pytest.skip(reason=f"{url} is down")

    # The PARQUET: prefix is followed by the /vsicurl/ path of the directory.
    dataset_name = "PARQUET:/vsicurl/https://overturemapswestus2.blob.core.windows.net/release/2024-11-13.0/theme=divisions/type=division_area"
    with ogr.Open(dataset_name) as dataset:
        layer = dataset.GetLayer(0)
        feature_count = layer.GetFeatureCount()
        assert feature_count > 0


###############################################################################


@gdaltest.enable_exceptions()
def test_ogr_parquet_write_arrow(tmp_vsimem):

Expand Down
112 changes: 69 additions & 43 deletions port/cpl_vsil_curl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1345,49 +1345,6 @@ vsi_l_offset VSICurlHandle::GetFileSizeOrHeaders(bool bSetError,
if (sWriteFuncHeaderData.pBuffer != nullptr &&
(response_code == 200 || response_code == 206))
{
const char *pzETag =
strstr(sWriteFuncHeaderData.pBuffer, "ETag: \"");
if (pzETag)
{
pzETag += strlen("ETag: \"");
const char *pszEndOfETag = strchr(pzETag, '"');
if (pszEndOfETag)
{
oFileProp.ETag.assign(pzETag, pszEndOfETag - pzETag);
}
}

// Azure Data Lake Storage
const char *pszPermissions =
strstr(sWriteFuncHeaderData.pBuffer, "x-ms-permissions: ");
if (pszPermissions)
{
pszPermissions += strlen("x-ms-permissions: ");
const char *pszEOL = strstr(pszPermissions, "\r\n");
if (pszEOL)
{
bool bIsDir =
strstr(sWriteFuncHeaderData.pBuffer,
"x-ms-resource-type: directory\r\n") != nullptr;
bool bIsFile =
strstr(sWriteFuncHeaderData.pBuffer,
"x-ms-resource-type: file\r\n") != nullptr;
if (bIsDir || bIsFile)
{
oFileProp.bIsDirectory = bIsDir;
std::string osPermissions;
osPermissions.assign(pszPermissions,
pszEOL - pszPermissions);
if (bIsDir)
oFileProp.nMode = S_IFDIR;
else
oFileProp.nMode = S_IFREG;
oFileProp.nMode |=
VSICurlParseUnixPermissions(osPermissions.c_str());
}
}
}

{
char **papszHeaders =
CSLTokenizeString2(sWriteFuncHeaderData.pBuffer, "\r\n", 0);
Expand All @@ -1409,6 +1366,44 @@ vsi_l_offset VSICurlHandle::GetFileSizeOrHeaders(bool bSetError,
{
m_bCached = false;
}

else if (EQUAL(pszKey, "ETag"))
{
std::string osValue(pszValue);
if (osValue.size() >= 2 && osValue.front() == '"' &&
osValue.back() == '"')
osValue = osValue.substr(1, osValue.size() - 2);
oFileProp.ETag = osValue;
}

// Azure Data Lake Storage
else if (EQUAL(pszKey, "x-ms-resource-type"))
{
if (EQUAL(pszValue, "file"))
{
oFileProp.nMode |= S_IFREG;
}
else if (EQUAL(pszValue, "directory"))
{
oFileProp.bIsDirectory = true;
oFileProp.nMode |= S_IFDIR;
}
}
else if (EQUAL(pszKey, "x-ms-permissions"))
{
oFileProp.nMode |=
VSICurlParseUnixPermissions(pszValue);
}

// https://overturemapswestus2.blob.core.windows.net/release/2024-11-13.0/theme%3Ddivisions/type%3Ddivision_area
// returns a x-ms-meta-hdi_isfolder: true header
else if (EQUAL(pszKey, "x-ms-meta-hdi_isfolder") &&
EQUAL(pszValue, "true"))
{
oFileProp.bIsAzureFolder = true;
oFileProp.bIsDirectory = true;
oFileProp.nMode |= S_IFDIR;
}
}
CPLFree(pszKey);
}
Expand Down Expand Up @@ -4890,6 +4885,37 @@ char **VSICurlFilesystemHandlerBase::GetFileList(const char *pszDirname,
if (!bListDir)
return nullptr;

// Deal with publicly visible Azure directories.
if (STARTS_WITH(osURL.c_str(), "https://"))
{
const char *pszBlobCore =
strstr(osURL.c_str(), ".blob.core.windows.net/");
if (pszBlobCore)
{
FileProp cachedFileProp;
GetCachedFileProp(osURL.c_str(), cachedFileProp);
if (cachedFileProp.bIsAzureFolder)
{
const char *pszURLWithoutHTTPS =
osURL.c_str() + strlen("https://");
const std::string osStorageAccount(
pszURLWithoutHTTPS, pszBlobCore - pszURLWithoutHTTPS);
CPLConfigOptionSetter oSetter1("AZURE_NO_SIGN_REQUEST", "YES",
false);
CPLConfigOptionSetter oSetter2("AZURE_STORAGE_ACCOUNT",
osStorageAccount.c_str(), false);
const std::string osVSIAZ(std::string("/vsiaz/").append(
pszBlobCore + strlen(".blob.core.windows.net/")));
char **papszFileList = VSIReadDirEx(osVSIAZ.c_str(), nMaxFiles);
if (papszFileList)
{
*pbGotFileList = true;
return papszFileList;
}
}
}
}

// HACK (optimization in fact) for MBTiles driver.
if (strstr(pszDirname, ".tiles.mapbox.com") != nullptr)
return nullptr;
Expand Down
1 change: 1 addition & 0 deletions port/cpl_vsil_curl_class.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class FileProp
std::string osRedirectURL{};
bool bHasComputedFileSize = false;
bool bIsDirectory = false;
bool bIsAzureFolder = false;
int nMode = 0; // st_mode member of struct stat
bool bS3LikeRedirect = false;
std::string ETag{};
Expand Down

0 comments on commit 89c4ca9

Please sign in to comment.