Merge pull request #48 from gmbrandt/feature/data_fetching

Queries data on GOST REST web service and implements caching.
gmbrandt · May 17, 2022 · 4725438 · 4725438
2 parents 63cfceb + 8629f8d
commit 4725438
Show file tree

Hide file tree

Showing 11 changed files with 1,835 additions and 1,016 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,9 @@
+1.1.1 (2022-05-10)
+------------------
+- Gaia Parser classes now can fetch GOST scanning law from the GOST API. 
+- Downloaded GOST scanning laws are saved to the directory provided.
+- Moved the example notebooks out of the examples/ folder, since having them there broke the filepaths.
+
 1.1.0 (2022-04-19)
 ------------------
 - Added a new hip2 parser, in special_parse.py, that will add the residual offset and cosmic dispersion of 

diff --git a/...les/GenerateSyntheticGaiaAstrometry.ipynb → GenerateSyntheticGaiaAstrometry.ipynb b/...les/GenerateSyntheticGaiaAstrometry.ipynb → GenerateSyntheticGaiaAstrometry.ipynb
@@ -313,15 +313,7 @@
    "execution_count": 8,
    "id": "ed9b916a",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING: AstropyDeprecationWarning: HIP_3850synthetic_gaia_abs_ast.ecsv already exists. Automatically overwriting ASCII files is deprecated. Use the argument 'overwrite=True' in the future. [astropy.io.ascii.ui]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "t.write(simbad_name.replace(' ', '_') + 'synthetic_gaia_abs_ast.ecsv')"
    ]
@@ -356,4 +348,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
diff --git a/README.rst b/README.rst
@@ -38,19 +38,27 @@ while in the root directory of this repo. It can also be installed directly from
 Usage
 -----
 
-HTOF has a rich variety of usages. We encourage the reader to consult the examples/examples.ipynb jupyter notebook
+HTOF has a rich variety of usages. We encourage the reader to consult the examples.ipynb jupyter notebook
 for a set of usage examples (e.g., fitting the standard astrometric model to data, combining astrometric missions).
-However, we also go into a few basic and specific use cases in this readme.
+However, we also go into a few basic and specific use cases in this readme. Also see the
+examples_recalibrating_hip2 and GenerateSyntheticGaiaAstrometry notebooks for more uses of htof.
 
 If you use HTOF, please cite the zenodo reference (https://doi.org/10.5281/zenodo.4104383) and the source paper (https://arxiv.org/abs/2109.06761)
 
 Usage: Fits without Parallax
 ----------------------------
 The following examples show how one would both load in and fit a line to the astrometric intermediate data
-from either Hipparcos data reduction or Gaia. Gaia requires you to first download a .csv of the
+from either Hipparcos data reduction or Gaia.
+
+Gaia requires the GOST scanning law for the particular star. HTOF will download it for you if you do not already
+have it, but you should provide a valid directory for htof to save the file into for future use. Currently, for the
+automatic download to work, you must provide a Hipparcos name for the source (e.g., 27321).
+
+If the automatic download of the GOST scanning law does not work, or the source does not have a
+Hipparcos ID, then you will have to download the GOST file manually with the user interface. Download a .csv of the
 predicted scans and scan epochs from GOST (https://gaia.esac.esa.int/gost/), in particular using the 'submit for
 events forecast' feature on the website. One should select the widest range of dates
-possible because \codename automatically restricts the predicted epochs of observations
+possible because htof automatically restricts the predicted epochs of observations
 to the desired data release range (e.g., EDR3) and removes any astrometric gaps.
 
 Let ra_vs_epoch, dec_vs_epoch be 1d arrays of ra and dec positions.
@@ -63,6 +71,8 @@ The following lines parse the intermediate data and fit a line.
     from htof.main import Astrometry
     import numpy as np
     astro = Astrometry('GaiaDR2', '027321', 'htof/test/data_for_tests/GaiaDR2/IntermediateData', format='jyear')  # parse
+    # note that if you do not have a GOST csv file with 027321 in the name inside of
+    # 'htof/test/data_for_tests/GaiaDR2/IntermediateData', then htof will download it for you automatically!
     ra_vs_epoch = dec_vs_epoch = np.zeros(len(astro.data), dtype=float) # dummy set of ra and dec to fit.
     ra0, dec0, mu_ra, mu_dec = astro.fit(ra_vs_epoch, dec_vs_epoch)
 
@@ -344,7 +354,7 @@ produce a fit which includes parallax. We now do:
     parallax, ra0, dec0, mu_ra, mu_dec = solution_vector
 
 
-For more examples, refer to the `examples/examples.ipynb` Jupyter notebook. There we will make a figure like Figure 3 from the HTOF paper.
+For more examples, refer to the `examples.ipynb` Jupyter notebook. There we will make a figure like Figure 3 from the HTOF paper.
 
 Flagged Sources
 ~~~~~~~~~~~~~~~

diff --git a/examples.ipynb b/examples.ipynb
diff --git a/examples/examples.ipynb b/examples/examples.ipynb
diff --git a/examples/examples_recalibrating_hip2.ipynb → examples_recalibrating_hip2.ipynb b/examples/examples_recalibrating_hip2.ipynb → examples_recalibrating_hip2.ipynb
@@ -80,7 +80,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/home/gmbrandt/Documents/Repositories/HTOF/htof/main.py:45: UserWarning: You have selected Hip2recalibrated, the recalibrated Hipparcos 2 data. Note that for this, you should be feeding in the filepaths to the Hip21 (Hip2 java tool data), because htof applies the recalibration on-the-fly for each file. As well, be sure to read Brandt et al. 2022 to understand the limitations of using the recalibrated data. \n",
+      "/home/gmbrandt/Documents/Repositories/HTOF/htof/main.py:42: UserWarning: You have selected Hip2recalibrated, the recalibrated Hipparcos 2 data. Note that for this, you should be feeding in the filepaths to the Hip21 (Hip2 java tool data), because htof applies the recalibration on-the-fly for each file. As well, be sure to read Brandt et al. 2022 to understand the limitations of using the recalibrated data. \n",
       "  warnings.warn(f'You have selected {data_choice}, the recalibrated Hipparcos 2 data. Note that for this,'\n"
      ]
     }

diff --git a/htof/parse.py b/htof/parse.py
@@ -12,13 +12,16 @@
 import numpy as np
 import pandas as pd
 from scipy import stats, special
+import requests
 import warnings
 from ast import literal_eval
 import os
 import re
 import glob
 import itertools
 from math import ceil, floor
+import xml.etree.ElementTree as ET
+from datetime import datetime, timedelta
 import pkg_resources
 
 from astropy.time import Time
@@ -178,6 +181,111 @@ def __init__(self, scan_angle=None, epoch=None, residuals=None, inverse_covarian
         self.min_epoch = min_epoch
         self.max_epoch = max_epoch
 
+    def download_gost_data(self, star_id):
+        """
+        Fetch the GOST scanning law for a Hipparcos source and reduce it to one row per observation.
+
+        :param star_id: Hipparcos identifier. It is prefixed with "HIP" to build the target name sent to GOST.
+        :return: the DataFrame built by parse_xml, after filtering it with keep_field_hits.
+        :raises RuntimeError: if query_gost_xml returns None, i.e., the query to GOST failed.
+        """
+        xml_text = self.query_gost_xml(f"HIP{star_id}")
+        if xml_text is None:
+            raise RuntimeError("Downloading the scanning law from GOST failed. Try again later, or download this"
+                               " file manually using the GOST online interface.")
+        # xml text -> pandas DataFrame -> only the first astrometric field hit of each observation.
+        return self.keep_field_hits(self.parse_xml(xml_text))
+
+    def save_gost_data(self, star_id: str, data: pd.DataFrame, intermediate_data_directory: str):
+        """
+        Write a GOST scanning law to HIP<star_id>.csv inside intermediate_data_directory.
+
+        The directory is created if it does not exist yet. The target path is built relative to the
+        current working directory.
+
+        :param star_id: Hipparcos identifier, used to build the file name.
+        :param data: the scanning law to save. It is written without the index.
+        :param intermediate_data_directory: directory in which the csv file is saved.
+        :return: None
+        """
+        os.makedirs(intermediate_data_directory, exist_ok=True)
+        filename = f"HIP{star_id}.csv"
+        destination = os.path.join(os.getcwd(), f"{intermediate_data_directory}/{filename}")
+        data.to_csv(destination, index=False, index_label=False)
+        return None
+
+    def query_gost_xml(self, target):
+        """
+        Query the GOST web service for the scanning law of `target` and return the raw response text.
+
+        :param target: name of the source as it is sent to GOST (download_gost_data passes "HIP<star_id>").
+        :return: the body of the response as a string, or None if the query failed for any reason. In the
+                 latter case a warning is issued instead of raising.
+        """
+        url = f"https://gaia.esac.esa.int/gost/GostServlet?name={target}&service=1"
+        try:
+            with requests.Session() as s:
+                # The response of this first request is discarded; it is only used to obtain the
+                # JSESSIONID cookie that is sent along explicitly with the second request.
+                s.get(url)
+                headers = {"Cookie": f"JSESSIONID={s.cookies.get_dict()['JSESSIONID']}"}
+                response = requests.request("GET", url, headers=headers, timeout=180)
+                return response.text
+        except Exception:
+            # Deliberately broad (connection problems, a missing JSESSIONID cookie, ...) because callers
+            # only need to know that the download failed. Unlike a bare `except:`, this lets
+            # KeyboardInterrupt and SystemExit propagate, so a hanging query can still be aborted.
+            warnings.warn("Querying the GOST service failed.")
+            return None
+
+    def parse_xml(self, response):
+        """
+        Convert the xml text returned by GOST into a DataFrame with one row per event.
+
+        The target-level values (name and coordinates) are repeated on every row. The event-level values
+        are read from each `event` element and its `details` child.
+
+        :param response: xml text, as returned by query_gost_xml.
+        :return: DataFrame with the thirteen GOST columns listed below, cast to the dtypes listed below.
+        """
+        columns = ["Target", "ra[rad]", "dec[rad]", "ra[h:m:s]", "dec[d:m:s]", "ObservationTimeAtGaia[UTC]",
+                   "CcdRow[1-7]", "zetaFieldAngle[rad]", "scanAngle[rad]", "Fov[FovP=preceding/FovF=following]",
+                   "parallaxFactorAlongScan", "parallaxFactorAcrossScan", "ObservationTimeAtBarycentre[BarycentricJulianDateInTCB]"]
+        dtypes = {"Target": str,
+                  "ra[rad]": float,
+                  "dec[rad]": float,
+                  "ra[h:m:s]": str,
+                  "dec[d:m:s]": str,
+                  "ObservationTimeAtGaia[UTC]": str,
+                  "CcdRow[1-7]": int,
+                  "zetaFieldAngle[rad]": float,
+                  "scanAngle[rad]": float,
+                  "Fov[FovP=preceding/FovF=following]": str,
+                  "parallaxFactorAlongScan": float,
+                  "parallaxFactorAcrossScan": float,
+                  "ObservationTimeAtBarycentre[BarycentricJulianDateInTCB]": float}
+        target_tags = ('name', 'coords/ra', 'coords/dec', 'coords/raHms', 'coords/decDms')
+        detail_tags = ('ccdRow', 'zetaFieldAngle', 'scanAngle', 'fov', 'parallaxFactorAl', 'parallaxFactorAc')
+
+        root = ET.fromstring(response)
+        # name, ra, dec (radians) and ra, dec (sexagesimal) of the target; the same for every event.
+        target_values = [root.find(f'./targets/target/{tag}').text for tag in target_tags]
+        rows = []
+        for event in root.findall('./targets/target/events/event'):
+            details = event.find('details')
+            utc_time = event.find('eventUtcDate').text
+            detail_values = [details.find(tag).text for tag in detail_tags]
+            barycentric_time = event.find('eventTcbBarycentricJulianDateAtBarycentre').text
+            rows.append(target_values + [utc_time] + detail_values + [barycentric_time])
+        return pd.DataFrame(rows, columns=columns).astype(dtypes)
+
+    def keep_field_hits(self, data):
+        """
+        Reduce a GOST table fetched through the REST service to one row per observation of the star.
+
+        GOST files downloaded from the web through REST contain a sequence of ten entries for every observation
+        of the star in the scanning law. The first entry is the skymapper CCD hit, and the remaining entries are
+        the hits on the astrometric field CCDs 1 through 9. Only the second entry of each sequence, i.e., the hit
+        on the first astrometric field CCD (AF1), is kept.
+
+        Rows are grouped into one sequence as long as they fall within one hour of the first row of that
+        sequence. Sequences with fewer than three rows are discarded.
+
+        :param data: DataFrame as returned by parse_xml. Rows are assumed to be in chronological order.
+        :return: DataFrame with the same columns, holding the second row of every sequence (including the
+                 final one), with a fresh integer index. An empty input gives an empty DataFrame.
+        """
+        columns = ["Target", "ra[rad]", "dec[rad]", "ra[h:m:s]", "dec[d:m:s]", "ObservationTimeAtGaia[UTC]",
+                   "CcdRow[1-7]", "zetaFieldAngle[rad]", "scanAngle[rad]", "Fov[FovP=preceding/FovF=following]",
+                   "parallaxFactorAlongScan", "parallaxFactorAcrossScan",
+                   "ObservationTimeAtBarycentre[BarycentricJulianDateInTCB]"]
+        time_format = "%Y-%m-%dT%H:%M:%S.%f"
+        buffer = timedelta(hours=1)
+        obs = []
+        seq = []
+        sequence_start = None
+        for index, row in data.iterrows():
+            row_time = datetime.strptime(row['ObservationTimeAtGaia[UTC]'], time_format)
+            if sequence_start is None:
+                # very first row: it opens the first sequence.
+                sequence_start = row_time
+            if row_time - sequence_start >= buffer:
+                # this row belongs to a new observation: store the AF1 hit of the finished sequence.
+                if len(seq) > 2:
+                    obs.append(seq[1])
+                seq = []
+                sequence_start = row_time
+            seq.append(row)
+        # The loop only stores a sequence once the next one begins, so the final sequence has to be
+        # flushed here. Otherwise the last observation would always be dropped.
+        if len(seq) > 2:
+            obs.append(seq[1])
+        data = pd.DataFrame(obs, columns=columns)
+        data = data.reset_index(drop=True)
+        return data
+
+    @staticmethod
+    def gost_file_exists(star_id: str, intermediate_data_directory: str):
+        """
+        Report whether DataParser.get_intermediate_data_file_path can locate a file for star_id.
+
+        :param star_id: identifier of the source to look for.
+        :param intermediate_data_directory: directory that is searched.
+        :return: False if the lookup raises FileNotFoundError, True otherwise.
+        """
+        try:
+            # TODO fix this so that this function does not throw an error maybe?
+            DataParser.get_intermediate_data_file_path(star_id, intermediate_data_directory)
+        except FileNotFoundError:
+            return False
+        return True
+
+    def read_intermediate_data_file(self, star_id: str, intermediate_data_directory: str, **kwargs):
+        """
+        Read the GOST file for star_id, downloading and saving it first if it is not on disk.
+
+        :param star_id: identifier of the source.
+        :param intermediate_data_directory: directory that is searched for the file, and into which a
+                                            downloaded file is saved.
+        :param kwargs: passed on to the parent class reader when the file already exists.
+        :return: the data read by the parent class, or the freshly downloaded DataFrame.
+        """
+        # search for the file in the intermediate_data_directory
+        if not self.gost_file_exists(star_id, intermediate_data_directory):
+            hip_id = str(star_id)
+            downloaded = self.download_gost_data(hip_id)
+            self.save_gost_data(hip_id, downloaded, intermediate_data_directory)
+            return downloaded
+        return super(GaiaData, self).read_intermediate_data_file(star_id, intermediate_data_directory, **kwargs)
+
     def parse(self, star_id, intermediate_data_directory, **kwargs):
         self.meta['star_id'] = star_id
         data = self.read_intermediate_data_file(star_id, intermediate_data_directory,

diff --git a/htof/test/data_for_tests/MockServer/HIP027321.xml b/htof/test/data_for_tests/MockServer/HIP027321.xml
diff --git a/htof/test/test_parse.py b/htof/test/test_parse.py
@@ -347,6 +347,31 @@ def test_eDR3parse_removes_dead_times(self):
         assert len(data._epoch) == 1
         assert np.isclose(data._epoch.iloc[0], 2456893.28785)
         assert np.isclose(data.scan_angle.iloc[0], -1.7804696884345342)
+
+    def test_gost_file_exists(self):
+        """ gost_file_exists is False for a star without a file, and truthy for one that has a file. """
+        missing = GaiaData.gost_file_exists(star_id="000000", intermediate_data_directory="htof/test/data_for_tests")
+        assert missing == False
+
+        gost_directory = "htof/test/data_for_tests/GaiaeDR3/IntermediateData"
+        found = GaiaData.gost_file_exists(star_id="027321", intermediate_data_directory=gost_directory)
+        assert found
+
+    @mock.patch('htof.parse.GaiaData.query_gost_xml')
+    def test_fetch_from_web(self, fake_xml_download):
+        """ Parsing from an empty directory (mocked download) must agree with parsing the stored GOST file. """
+        # reference: parse the GOST file that is stored with the test data.
+        comparison_data = GaiaeDR3()
+        comparison_data.parse('27321', 'htof/test/data_for_tests/GaiaeDR3/IntermediateData')
+        # mock out the query to the internet with a pre-downloaded xml response.
+        with open('htof/test/data_for_tests/MockServer/HIP027321.xml') as xml_file:
+            xml_text = xml_file.read()
+        fake_xml_download.return_value = xml_text
+        # open up a temporary directory with no GOST files.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            data = GaiaeDR3()
+            data.parse('27321', tmp_dir)
+
+        # tolerances: one minute in epoch, 0.01 degrees in scan angle, 1e-4 in parallax factor.
+        assert np.allclose(data.julian_day_epoch(), comparison_data.julian_day_epoch(), atol=1/(24*60))
+        assert np.allclose(data.scan_angle, comparison_data.scan_angle, atol=0.01*np.pi/180)
+        assert np.allclose(data.parallax_factors, comparison_data.parallax_factors, atol=0.0001)
 
     def test_scale_along_scan_errors(self):
         test_data_directory = os.path.join(os.getcwd(), 'htof/test/data_for_tests/GaiaDR2/IntermediateData')
@@ -359,6 +384,44 @@ def test_scale_along_scan_errors(self):
         data.scale_along_scan_errs(1/0.2)
         assert np.allclose(data.along_scan_errs, 1)
 
+    @mock.patch('htof.parse.requests.Session', autospec=True)
+    @mock.patch('htof.parse.requests.request')
+    def test_query_gost_xml(self, mock_request, mock_session):
+        """ query_gost_xml returns the (truthy) text of the response when both requests succeed. """
+        # mock_session needs s.get(url) and s.cookies.get_dict() needs to have a JSESSIONID
+        mock_session.return_value = MockSession()
+        # the response only needs a `text` attribute, which MockSession also provides.
+        mock_request.return_value = MockSession()
+        parser = GaiaData()
+        assert parser.query_gost_xml('target')
+
+    @mock.patch('htof.parse.requests.Session', autospec=True)
+    def test_query_gost_xml_fails(self, mock_session):
+        """ query_gost_xml returns None when the first request raises. """
+        # mock_session needs s.get(url) and s.cookies.get_dict() needs to have a JSESSIONID
+        # pass_url_stage=False makes s.get(url) raise.
+        mock_session.return_value = MockSession(pass_url_stage=False)
+        parser = GaiaData()
+        assert parser.query_gost_xml('target') is None
+
+
+class MockSession(object):
+    """
+    Minimal stand-in for requests.Session, also used as a fake response, in the query_gost_xml tests.
+
+    It can be used as a context manager, exposes cookies.get_dict() with a JSESSIONID, and has a
+    truthy `text` attribute.
+    """
+    # cookie jar whose get_dict() always reports a session id.
+    cookies = mock.Mock()
+    cookies.get_dict.return_value = {'JSESSIONID': 'session'}
+    # truthy placeholder for the text of a response.
+    text = True
+
+    def __init__(self, pass_url_stage=True):
+        # if False, get() raises in order to emulate a failed request.
+        self.pass_url_stage = pass_url_stage
+
+    def get(self, url):
+        if not self.pass_url_stage:
+            # force an error
+            raise RuntimeError()
+        return ''
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        # returns None, so an exception raised inside the `with` block propagates.
+        pass
+
 
 def test_write_with_missing_info():
     data = DataParser(scan_angle=np.arange(3), epoch=np.arange(1991, 1994),

diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ astropy>=2.0
 pandas>=0.24.0
 scipy>=1.0.0
 numpy>=1.17
+requests
diff --git a/setup.py b/setup.py
@@ -1,12 +1,12 @@
 from setuptools import setup, find_packages
 
 setup(name='htof',
-      author='G. Mirek Brandt, Daniel Michalik',
-      version='1.1.0',
+      author='G. Mirek Brandt, Daniel Michalik, Gavin K. Hung',
+      version='1.1.1',
       python_requires='>=3.6',
       packages=find_packages(),
       package_dir={'htof': 'htof'},
       package_data={'htof': ['data/*.csv', 'data/*.txt']},
       setup_requires=['pytest-runner'],
-      install_requires=['astropy>=2.0', 'pandas>=0.24.0', 'scipy>=1.0.0', 'numpy>=1.16'],
+      install_requires=['astropy>=2.0', 'pandas>=0.24.0', 'scipy>=1.0.0', 'numpy>=1.16', 'requests'],
       tests_require=['pytest>=3.5'])