Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cumulative fixes and updates #97

Merged
merged 11 commits into from
Apr 11, 2023
13 changes: 5 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,28 @@ The following datasets are supported:

| Dataset Name | Dataset ID |
|-|-|
| Landsat 5 TM Collection 1 Level 1 | `landsat_tm_c1` |
| Landsat 5 TM Collection 2 Level 1 | `landsat_tm_c2_l1` |
| Landsat 5 TM Collection 2 Level 2 | `landsat_tm_c2_l2` |
| Landsat 7 ETM+ Collection 1 Level 1 | `landsat_etm_c1` |
| Landsat 7 ETM+ Collection 2 Level 1 | `landsat_etm_c2_l1` |
| Landsat 7 ETM+ Collection 2 Level 2 | `landsat_etm_c2_l2` |
| Landsat 8 Collection 1 Level 1 | `landsat_8_c1` |
| Landsat 8 Collection 2 Level 1 | `landsat_ot_c2_l1` |
| Landsat 8 Collection 2 Level 2 | `landsat_ot_c2_l2` |
| Sentinel 2A | `sentinel_2a` |

| Landsat 9 Collection 2 Level 1 | `landsat_ot_c2_l1` |
| Landsat 9 Collection 2 Level 2 | `landsat_ot_c2_l2` |

# Quick start

Searching for Landsat 5 TM scenes that contain the location (12.53, -1.53), acquired during the year 1995.

```
landsatxplore search --dataset LANDSAT_TM_C1 --location 12.53 -1.53 \
landsatxplore search --dataset landsat_tm_c2_l1 --location 12.53 -1.53 \
--start 1995-01-01 --end 1995-12-31
```

Search for Landsat 7 ETM scenes in Brussels with less than 5% cloud cover. Save the returned results in a `.csv` file.

```
landsatxplore search --dataset LANDSAT_ETM_C1 \
landsatxplore search --dataset landsat_etm_c2_l2 \
--location 50.83 4.38 --clouds 5 > results.csv
```

Expand Down Expand Up @@ -181,7 +178,7 @@ api = API(username, password)

# Search for Landsat TM scenes
scenes = api.search(
dataset='landsat_tm_c1',
dataset='landsat_tm_c2_l1',
latitude=50.85,
longitude=-4.35,
start_date='1995-01-01',
Expand Down
10 changes: 8 additions & 2 deletions landsatxplore/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,9 @@ def search(
"--timeout", "-t", type=click.INT, default=300, help="Download timeout in seconds."
)
@click.option("--skip", is_flag=True, default=False)
@click.option("--overwrite", is_flag=True, default=False)
@click.argument("scenes", type=click.STRING, nargs=-1)
def download(username, password, dataset, output, timeout, skip, scenes):
def download(username, password, dataset, output, timeout, skip, overwrite, scenes):
"""Download one or several scenes."""
ee = EarthExplorer(username, password)
output_dir = os.path.abspath(output)
Expand All @@ -162,7 +163,12 @@ def download(username, password, dataset, output, timeout, skip, scenes):
if not ee.logged_in():
ee = EarthExplorer(username, password)
fname = ee.download(
scene, output_dir, dataset=dataset, timeout=timeout, skip=skip
scene,
output_dir,
dataset=dataset,
timeout=timeout,
skip=skip,
overwrite=overwrite,
)
if skip:
click.echo(fname)
Expand Down
131 changes: 93 additions & 38 deletions landsatxplore/earthexplorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,31 +20,24 @@

# IDs of GeoTIFF data products for each dataset.
# Multiple product IDs have been observed per dataset (they vary with the
# user's API access level), so each dataset maps to a list of candidate IDs
# that are tried in order until one succeeds.
DATA_PRODUCTS = {
    # Collection 2 Level 1 datasets
    "landsat_tm_c2_l1": ["5e81f14f92acf9ef", "5e83d0a0f94d7d8d", "63231219fdd8c4e5"],
    "landsat_etm_c2_l1": ["5e83d0d0d2aaa488", "5e83d0d08fec8a66"],
    "landsat_ot_c2_l1": ["5e81f14ff4f9941c", "5e81f14f92acf9ef"],
    # Collection 2 Level 2 datasets
    # NOTE: the original list for landsat_tm_c2_l2 contained the same ID
    # twice, which caused one guaranteed-redundant retry; the duplicate
    # has been removed.
    "landsat_tm_c2_l2": ["5e83d11933473426", "632312ba6c0988ef"],
    "landsat_etm_c2_l2": ["5e83d12aada2e3c5", "5e83d12aed0efa58", "632311068b0935a8"],
    "landsat_ot_c2_l2": ["5e83d14f30ea90a9", "5e83d14fec7cae84", "632210d4770592cf"],
}


def _get_tokens(body):
"""Get `csrf_token` and `__ncforminfo`."""
def _get_token(body):
"""Get `csrf_token`."""
csrf = re.findall(r'name="csrf" value="(.+?)"', body)[0]
ncform = re.findall(r'name="__ncforminfo" value="(.+?)"', body)[0]


if not csrf:
raise EarthExplorerError("EE: login failed (csrf token not found).")
if not ncform:
raise EarthExplorerError("EE: login failed (ncforminfo not found).")

return csrf, ncform

return csrf

class EarthExplorer(object):
"""Access Earth Explorer portal."""
Expand All @@ -63,12 +56,11 @@ def logged_in(self):
def login(self, username, password):
"""Login to Earth Explorer."""
rsp = self.session.get(EE_LOGIN_URL)
csrf, ncform = _get_tokens(rsp.text)
csrf = _get_token(rsp.text)
payload = {
"username": username,
"password": password,
"csrf": csrf,
"__ncforminfo": ncform,
}
rsp = self.session.post(EE_LOGIN_URL, data=payload, allow_redirects=True)

Expand All @@ -78,8 +70,10 @@ def login(self, username, password):
def logout(self):
    """Terminate the current Earth Explorer session.

    Issues a plain GET on the logout endpoint; the server is expected to
    invalidate the session cookie as a side effect.
    """
    session = self.session
    session.get(EE_LOGOUT_URL)

def _download(self, url, output_dir, timeout, chunk_size=1024, skip=False):

def _download(
self, url, output_dir, timeout, chunk_size=1024, skip=False, overwrite=False
):
"""Download remote file given its URL."""
# Check availability of the requested product
# EarthExplorer should respond with JSON
Expand All @@ -93,30 +87,77 @@ def _download(self, url, output_dir, timeout, chunk_size=1024, skip=False):
download_url = r.json().get("url")

try:
local_filename, filesize = self._get_fileinfo(
download_url, timeout=timeout, output_dir=output_dir
)

if skip:
return local_filename

headers = {}
file_mode = "wb"
downloaded_bytes = 0
file_exists = os.path.exists(local_filename)

if file_exists and not overwrite:
downloaded_bytes = os.path.getsize(local_filename)
headers = {"Range": f"bytes={downloaded_bytes}-"}
file_mode = "ab"
if file_exists and downloaded_bytes == filesize:
# assert file is already complete
return local_filename

with self.session.get(
download_url, stream=True, allow_redirects=True, timeout=timeout
download_url,
stream=True,
allow_redirects=True,
headers=headers,
timeout=timeout,
) as r:
file_size = int(r.headers.get("Content-Length"))
with tqdm(
total=file_size, unit_scale=True, unit="B", unit_divisor=1024
total=filesize,
unit_scale=True,
unit="B",
unit_divisor=1024,
initial=downloaded_bytes
) as pbar:
local_filename = r.headers["Content-Disposition"].split("=")[-1]
local_filename = local_filename.replace('"', "")
local_filename = os.path.join(output_dir, local_filename)
if skip:
return local_filename
with open(local_filename, "wb") as f:
with open(local_filename, file_mode) as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
pbar.update(chunk_size)
return local_filename

except requests.exceptions.Timeout:
raise EarthExplorerError(
"Connection timeout after {} seconds.".format(timeout)
)
return local_filename

def download(self, identifier, output_dir, dataset=None, timeout=300, skip=False):
def _get_fileinfo(self, download_url, timeout, output_dir):
    """Get the local file name and expected size for a download URL.

    Issues a streaming GET (the response body is not consumed) and reads
    the `Content-Length` and `Content-Disposition` response headers.

    Parameters
    ----------
    download_url : str
        Direct download URL returned by Earth Explorer.
    timeout : int
        Connection timeout in seconds.
    output_dir : str
        Directory where the file would be stored; joined with the remote
        file name to build the returned path.

    Returns
    -------
    tuple of (str, int)
        Absolute local file path and expected file size in bytes.

    Raises
    ------
    EarthExplorerError
        On connection timeout, or if the server response is missing the
        `Content-Length` or `Content-Disposition` header.
    """
    try:
        with self.session.get(
            download_url, stream=True, allow_redirects=True, timeout=timeout
        ) as r:
            # Previously a missing Content-Length crashed with TypeError
            # (int(None)) and a missing Content-Disposition with KeyError;
            # raise an explicit, actionable error instead.
            content_length = r.headers.get("Content-Length")
            if content_length is None:
                raise EarthExplorerError(
                    "EE: download response is missing the Content-Length header."
                )
            file_size = int(content_length)
            content_disposition = r.headers.get("Content-Disposition")
            if content_disposition is None:
                raise EarthExplorerError(
                    "EE: download response is missing the Content-Disposition header."
                )
            # NOTE(review): split("=")[-1] assumes a simple
            # `attachment; filename="..."` header — it would mishandle a
            # filename containing '='; kept for backward compatibility.
            local_filename = content_disposition.split("=")[-1].replace('"', "")
            local_filename = os.path.join(output_dir, local_filename)
    except requests.exceptions.Timeout:
        raise EarthExplorerError(
            "Connection timeout after {} seconds.".format(timeout)
        )
    return local_filename, file_size

def download(
self,
identifier,
output_dir,
dataset=None,
timeout=300,
skip=False,
overwrite=False,
):
"""Download a Landsat scene.

Parameters
Expand Down Expand Up @@ -144,8 +185,22 @@ def download(self, identifier, output_dir, dataset=None, timeout=300, skip=False
entity_id = self.api.get_entity_id(identifier, dataset)
else:
entity_id = identifier
url = EE_DOWNLOAD_URL.format(
data_product_id=DATA_PRODUCTS[dataset], entity_id=entity_id
)
filename = self._download(url, output_dir, timeout=timeout, skip=skip)
# Cycle through the available dataset ids until one works
dataset_id_list = DATA_PRODUCTS[dataset]
id_num = len(dataset_id_list)
for id_count, dataset_id in enumerate(dataset_id_list):
try:
url = EE_DOWNLOAD_URL.format(
data_product_id=dataset_id, entity_id=entity_id
)
filename = self._download(
url, output_dir, timeout=timeout, skip=skip, overwrite=overwrite
)
except EarthExplorerError:
if id_count+1 < id_num:
print('Download failed with dataset id {:d} of {:d}. Re-trying with the next one.'.format(id_count+1, id_num))
Comment on lines +200 to +201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only one thing: This warning is somehow confusing (to me):

Suggested change
if id_count+1 < id_num:
print('Download failed with dataset id {:d} of {:d}. Re-trying with the next one.'.format(id_count+1, id_num))
if id_count+1 < id_num:
print('Download failed with dataset id {:d} of {:d}. Re-trying with the next one.'.format(id_count+1, id_num))

In my test (#97 (comment)) it shows up while all scenes seem to be found and downloaded.

It seems to appear with every download, not sure why... Maybe an index issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is related to the API policy which restricts access to the download-option function to users who have explicitly asked for API access. The details are in this commit f31685c. If one could access this function without special permissions, one could always use the download-option function and get the correct dataset-id there. Unfortunately this is not the case, and to avoid forcing all users to ask for API access, the compromise solution was to hard-code all known dataset ids into the library and try different ids until one works.

I understand that printing those statements may be confusing, and probably it would be better to just try all dataset ids, and just in case each and every one of them fails, raise an error or a warning to communicate to the user that the dataset id list needs to be updated by the maintainer.

pass
else:
print('None of the archived ids succeeded! Update necessary!')
raise EarthExplorerError()
return filename
9 changes: 5 additions & 4 deletions landsatxplore/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,16 @@ def parse_scene_id(scene_id):
}


def landsat_dataset(satellite, collection="c1", level="l1"):
def landsat_dataset(satellite, collection="c2", level="l1"):
"""Get landsat dataset name."""
if satellite == 5:
sensor = "tm"
collection = "c2"
elif satellite == 7:
sensor = "etm"
elif satellite == 8 and collection == "c1":
sensor = "8"
elif satellite == 8 and collection == "c2":
elif satellite in (8, 9) and collection == "c1":
raise ValueError('Collection 1 was decommissioned!')
elif satellite in [8, 9] and collection == "c2":
sensor = "ot"
else:
raise LandsatxploreError("Failed to guess dataset from identifier.")
Expand Down