Skip to content

gee_catalog

gee_catalog #1289

Workflow file for this run

# Workflow that runs on a daily schedule and on manual dispatch.
name: gee_catalog
on:
  # Allows the workflow to be started by hand from the Actions tab.
  workflow_dispatch:
  schedule:
    # Every day at 00:15 (GitHub evaluates cron schedules in UTC).
    - cron: '15 0 * * *'
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo content
        # Check out the repository content onto the GitHub runner.
        # v4 replaces v2, which targets the retired Node 12 action runtime.
        uses: actions/checkout@v4
      - name: setup python
        # v5 replaces v2 for the same runtime reason; the input is unchanged.
        uses: actions/setup-python@v5
        with:
          # Python version the scripts below run on; quoted so YAML keeps
          # it a string instead of parsing it as a float.
          python-version: '3.8'
      - name: install python packages
        # pip and setuptools are upgraded once (the original upgraded pip
        # three times in a row), then the script dependencies are installed.
        run: |
          python -m pip install --upgrade pip setuptools
          pip install pendulum beautifulsoup4 requests natsort
- name: Script check
uses: jannekem/run-python-script-action@v1
with:
script: |
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import json
import urllib
from natsort import natsorted
# Time of this run; its date is used as the end date of datasets whose
# temporal interval has no end.
now = datetime.now()
# One summary dict per successfully parsed dataset, filled by parseurl().
catalog = []
# 1-based counter of parsed datasets, used for the progress output.
i = 1


def _first_link(links, rel):
    """Return the href of the first link whose "rel" equals ``rel``, or ""."""
    return next((link["href"] for link in links if link.get("rel") == rel), "")


def parseurl(url):
    """Fetch one dataset JSON document and append its summary to ``catalog``.

    The document is expected to provide "id", "title", "gee:type",
    "extent.temporal.interval", "providers", "keywords" and "links".
    An interval without an end date gets the date of ``now`` as its end.
    A missing "preview" or "license" link yields an empty string instead
    of discarding the whole dataset.

    Args:
        url: Address of the dataset JSON document.

    Returns:
        None. On success one dict is appended to the module-level
        ``catalog`` and the module-level counter ``i`` is incremented.
        Any failure (network error, HTTP error status, missing field) is
        printed together with ``url`` and the dataset is skipped.
    """
    global i
    try:
        # Bounded wait: a stalled connection must not hang the whole run.
        response = requests.get(url, timeout=60)
        # Report HTTP errors as such instead of as a confusing KeyError below.
        response.raise_for_status()
        r = response.json()
        gee_id = r["id"]
        print(f"{i}: {gee_id}")
        gee_title = r["title"]
        gee_type = r["gee:type"]
        # Only the first temporal interval is used; keep the date part only.
        interval = r["extent"]["temporal"]["interval"][0]
        gee_start = interval[0].split("T")[0]
        if interval[1] is not None:
            gee_end = interval[1].split("T")[0]
        else:
            gee_end = now.strftime("%Y-%m-%d")
        asset = {
            "id": gee_id,
            "provider": r["providers"][0]["name"],
            "title": gee_title,
            "start_date": gee_start,
            "end_date": gee_end,
            "startyear": gee_start.split("-")[0],
            "endyear": gee_end.split("-")[0],
            "type": gee_type,
            "tags": ", ".join(r["keywords"]),
            # "asset_url" is taken from the link with rel == "license".
            "asset_url": _first_link(r["links"], "license"),
            "thumbnail_url": _first_link(r["links"], "preview"),
        }
        catalog.append(asset)
        i = i + 1
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole crawl.
        print(url)
        print(e)
# Dataset document URLs collected by yf().
asset_list = []


def yf(url):
    """Walk the catalog tree rooted at ``url`` and collect dataset URLs.

    Every link with rel == "child" is followed: a child whose href ends in
    "catalog.json" is treated as a nested catalog and walked recursively;
    any other child href is appended to the module-level ``asset_list``.

    Args:
        url: Address of a catalog JSON document.

    Returns:
        None. Results accumulate in ``asset_list``. A catalog that does not
        answer with HTTP 200 is reported and skipped; network errors and
        timeouts raised by ``requests`` propagate to the caller.
    """
    # Bounded wait: a stalled connection must not hang the whole run.
    page = requests.get(url, timeout=60)
    if page.status_code != 200:
        # Make the skipped subtree visible in the job log.
        print(f"Skipping {url}: HTTP {page.status_code}")
        return
    children = [
        link["href"]
        for link in page.json()["links"]
        if link.get("rel") == "child"
    ]
    for child in children:
        if child.endswith("catalog.json"):
            yf(child)
        else:
            asset_list.append(child)
# Crawl the catalog tree starting from the root catalog document.
yf(url="https://earthengine-stac.storage.googleapis.com/catalog/catalog.json")
# Drop duplicate URLs, then put them in natural sort order.
item_list = natsorted(list(set(asset_list)))
try:
    for items in item_list:
        parseurl(items)
except Exception as e:
    print(f"Failed for {items}")
    print(e)

# An empty result means the crawl failed (for example the root catalog was
# unreachable). Exit non-zero instead of replacing gee_catalog.json with
# "[]", which later steps would otherwise convert and commit.
if not catalog:
    raise SystemExit("No datasets were parsed; gee_catalog.json left untouched.")

with open("gee_catalog.json", "w") as file:
    json.dump(catalog, file, indent=4, sort_keys=True)
- name: json2csv
uses: jannekem/run-python-script-action@v1
with:
script: |
import csv
import json

# CSV column order; each name is read as a key from every catalog record.
FIELDNAMES = [
    "id",
    "provider",
    "title",
    "start_date",
    "end_date",
    "startyear",
    "endyear",
    "type",
    "tags",
    "asset_url",
    "thumbnail_url",
]

with open('gee_catalog.json') as f:
    data = json.load(f)

# The file is opened once and written through a single writer. The original
# reopened it in append mode for every row, which is slow and can interleave
# with a header that is still buffered in the first handle.
# newline='' is what the csv module asks for so it controls line endings itself.
with open('gee_catalog.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile, fieldnames=FIELDNAMES, delimiter=',', lineterminator='\n'
    )
    writer.writeheader()
    for datasets in data:
        # Pick the known columns explicitly: a missing key still raises
        # KeyError as before, and extra keys in a record are ignored.
        writer.writerow({field: datasets[field] for field in FIELDNAMES})
- name: file_check
  # Print a long listing of the working directory, dotfiles included,
  # into the job log.
  run: ls -l -a
- name: commit files
  run: |
    git config --local user.email "action@github.com"
    git config --local user.name "GitHub Action"
    today=$(date +"%Y-%m-%d")
    git add -A
    git commit -m "updated datasets ${today} UTC" -a
    git pull origin master
  # A failing command in this step does not fail the job.
  continue-on-error: true
- name: push changes
  uses: ad-m/github-push-action@v0.6.0
  with:
    branch: master
    github_token: ${{ secrets.GITHUB_TOKEN }}
  # A failed push does not fail the job either.
  continue-on-error: true