-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
hxl2pandas (#4): created, based on hxl2example
- Loading branch information
Showing
3 changed files
with
309 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,305 @@ | ||
#!/usr/bin/env python3 | ||
#=============================================================================== | ||
# | ||
# FILE: hxl2pandas | ||
# | ||
# USAGE: cat hxlated-file.csv | hxl2pandas | ||
# | ||
# DESCRIPTION: Important point: both the `hxl2pandas` and the | ||
# EticaAI-Data_HXL-Data-Science-file-formats_Pandas reference | ||
# table are mostly as reference of how pandas (more specifically | ||
# DataFrame) could be used as an intermediate format to export | ||
# HXL to other formats already supported by Pandas. | ||
# | ||
# While the reference table may still be useful for those who | ||
# are doing manual conversion or to help understand how different | ||
# tools used for data mining / machine learning would use HXL | ||
# attributes, the `hxl2pandas` may not be implemented at all. | ||
# Also some of the intermediate formats may be converted using | ||
# other libraries. | ||
# | ||
# OPTIONS: --- | ||
# | ||
# REQUIREMENTS: - python3 | ||
# - libhxl (https://pypi.org/project/libhxl/) | ||
# BUGS: --- | ||
# NOTES: --- | ||
# AUTHOR: Emerson Rocha <rocha[at]ieee.org> | ||
# COMPANY: EticaAI | ||
# LICENSE: Public Domain dedication | ||
# SPDX-License-Identifier: Unlicense | ||
# VERSION: v1.0-draft | ||
# CREATED: 2021-01-36 03:43 UTC v1.0 imported from hxl2example | ||
# REVISION: --- | ||
#=============================================================================== | ||
|
||
import sys | ||
import os | ||
import logging | ||
import argparse | ||
|
||
# Do not import hxl, to avoid circular imports | ||
import hxl.converters | ||
import hxl.filters | ||
import hxl.io | ||
|
||
# In Python2, sys.stdin is a byte stream; in Python3, it's a text stream | ||
STDIN = sys.stdin.buffer | ||
|
||
|
||
class HXL2Pandas:
    """
    HXL2Pandas is a class to export already HXLated data.

    At this stage it only re-emits the HXL source as CSV; it is meant as a
    starting point for other output formats. Command line handling is
    delegated to an HXLUtils instance stored in ``self.hxlhelper``.
    """

    def __init__(self):
        """
        Constructs all the necessary attributes for the HXL2Pandas object.
        """
        # Both are populated by make_args_HXL2Pandas()
        self.hxlhelper = None
        self.args = None

        # Posix exit codes
        self.EXIT_OK = 0
        self.EXIT_ERROR = 1
        self.EXIT_SYNTAX = 2

    def make_args_HXL2Pandas(self):
        """
        Build the command line parser, parse sys.argv and apply common args.

        @returns: the parsed argparse namespace (also kept in self.args).
        """
        self.hxlhelper = HXLUtils()
        parser = self.hxlhelper.make_args(
            description=("HXL2Pandas is an example script to create other "
                         "scripts with some bare minimum command line "
                         "interfaces that could work to export HXL files to "
                         "other formats."))

        self.args = parser.parse_args()

        # Without this call the --log option is accepted but never applied.
        self.hxlhelper.do_common_args(self.args)

        return self.args

    def do_example_output(self, args,
                          stdin=STDIN, stdout=sys.stdout, stderr=sys.stderr):
        """
        The do_example_output is the main entrypoint of HXL2Pandas. When
        called will convert the HXL source to example format.

        @param args: parsed arguments, as returned by make_args_HXL2Pandas
        @param stdin: stream used as input when args.infile is not given
        @param stdout: stream used as output when args.outfile is not given
        @param stderr: currently unused
        @returns: self.EXIT_OK
        """
        # Allow calling this method with externally built args, i.e. without
        # a previous call to make_args_HXL2Pandas().
        if self.hxlhelper is None:
            self.hxlhelper = HXLUtils()

        # NOTE: the next lines, in fact, only generate a csv output. So you
        #       can use as starting point.
        with self.hxlhelper.make_source(args, stdin) as source, \
                self.hxlhelper.make_output(args, stdout) as output:
            # --remove-headers and --strip-tags are both honoured here.
            hxl.io.write_hxl(output.output, source,
                             show_headers=not args.remove_headers,
                             show_tags=not args.strip_tags)

        return self.EXIT_OK
|
||
|
||
class HXLUtils:
    """
    HXLUtils contains functions from the Console scripts of libhxl-python
    (HXLStandard/libhxl-python/blob/master/hxl/scripts.py) with few changes
    to be used as class (and have one single place to change).
    Last update on this class was 2021-01-25.
    Author: David Megginson
    License: Public Domain
    """

    def __init__(self):
        """Set up the logger and the exit codes used by console scripts."""
        self.logger = logging.getLogger(__name__)

        # Posix exit codes
        self.EXIT_OK = 0
        self.EXIT_ERROR = 1
        self.EXIT_SYNTAX = 2

    def make_args(self, description, hxl_output=True):
        """Set up parser with default arguments.
        @param description: usage description to show
        @param hxl_output: if True (default), include options for HXL output.
        @returns: an argument parser, partly set up.
        """
        parser = argparse.ArgumentParser(description=description)
        parser.add_argument(
            'infile',
            help='HXL file to read (if omitted, use standard input).',
            nargs='?'
        )
        if hxl_output:
            parser.add_argument(
                'outfile',
                help='HXL file to write (if omitted, use standard output).',
                nargs='?'
            )
        parser.add_argument(
            '--sheet',
            help='Select sheet from a workbook (1 is first sheet)',
            metavar='number',
            type=int,
            nargs='?'
        )
        parser.add_argument(
            '--selector',
            help='JSONPath expression for starting point in JSON input',
            metavar='path',
            nargs='?'
        )
        parser.add_argument(
            '--http-header',
            help='Custom HTTP header to send with request',
            metavar='header',
            action='append'
        )
        if hxl_output:
            parser.add_argument(
                '--remove-headers',
                help='Strip text headers from the CSV output',
                action='store_const',
                const=True,
                default=False
            )
            parser.add_argument(
                '--strip-tags',
                help='Strip HXL tags from the CSV output',
                action='store_const',
                const=True,
                default=False
            )
        parser.add_argument(
            "--ignore-certs",
            help="Don't verify SSL connections (useful for self-signed)",
            action='store_const',
            const=True,
            default=False
        )
        parser.add_argument(
            '--log',
            help='Set minimum logging level',
            metavar='debug|info|warning|error|critical|none',
            # 'none' is advertised by the metavar, so it must be accepted;
            # do_common_args() knows how to handle it.
            choices=['debug', 'info', 'warning', 'error', 'critical', 'none'],
            default='error'
        )
        return parser

    def add_queries_arg(
        self,
        parser,
        help='Apply only to rows matching at least one query.'
    ):
        """Add the repeatable -q/--query option to a parser.
        @param parser: the argparse parser to extend
        @param help: help text to show for the option
        @returns: the same parser, for chaining.
        """
        parser.add_argument(
            '-q',
            '--query',
            help=help,
            metavar='<tagspec><op><value>',
            action='append'
        )
        return parser

    def do_common_args(self, args):
        """Process standard args (currently only the logging level).
        @param args: parsed arguments; args.log is read
        """
        level_name = args.log.upper()
        if level_name == 'NONE':
            # There is no standard "NONE" level: use a threshold above
            # CRITICAL so that none of the standard levels is emitted.
            level = logging.CRITICAL + 1
        else:
            level = level_name
        logging.basicConfig(
            format='%(levelname)s (%(name)s): %(message)s',
            level=level)

    def make_source(self, args, stdin=None):
        """Create a HXL input source.
        @param args: parsed arguments (see make_input for what is read)
        @param stdin: stream to read when no input file/URL was given;
                      None (default) means the binary stdin buffer.
        @returns: the result of hxl.io.data() for the input object.
        """
        # construct the input object
        input_object = self.make_input(args, stdin)
        return hxl.io.data(input_object)

    def make_input(self, args, stdin=None, url_or_filename=None):
        """Create an input object
        @param args: parsed arguments; infile, sheet, selector, http_header
                     and ignore_certs are read
        @param stdin: stream to read when no input file/URL was given;
                      None (default) means the binary stdin buffer, the same
                      default used by make_source.
        @param url_or_filename: overrides args.infile when not None
        @returns: the result of hxl.io.make_input().
        """
        if stdin is None:
            # Resolved at call time (not at definition time), and as a byte
            # stream: in Python3 sys.stdin itself is a text stream.
            stdin = sys.stdin.buffer

        if url_or_filename is None:
            url_or_filename = args.infile

        # sheet index: the command line is 1-based, the value passed on is
        # 0-based
        sheet_index = args.sheet
        if sheet_index is not None:
            sheet_index -= 1

        # JSONPath selector
        selector = args.selector

        http_headers = self.make_headers(args)

        return hxl.io.make_input(
            url_or_filename or stdin,
            sheet_index=sheet_index,
            selector=selector,
            allow_local=True,
            http_headers=http_headers,
            verify_ssl=(not args.ignore_certs)
        )

    def make_output(self, args, stdout=sys.stdout):
        """Create an output stream.
        @param args: parsed arguments; args.outfile is read
        @param stdout: stream to wrap when no output file was given
        @returns: a FileOutput for args.outfile, else a StreamOutput.
        """
        if args.outfile:
            return FileOutput(args.outfile)
        else:
            return StreamOutput(stdout)

    def make_headers(self, args):
        """Collect custom HTTP headers as a dict.

        Headers come from the HXL_HTTP_HEADER environment variable (if set)
        followed by every --http-header option; later entries with the same
        name replace earlier ones.
        @param args: parsed arguments; args.http_header is read
        @returns: dict mapping header name to header value.
        """
        # get custom headers
        header_strings = []
        header = os.environ.get("HXL_HTTP_HEADER")
        if header is not None:
            header_strings.append(header)
        if args.http_header is not None:
            header_strings += args.http_header
        http_headers = {}
        for header in header_strings:
            # Split on the first ':' only, so values may contain colons
            name, _, value = header.partition(':')
            http_headers[name.strip()] = value.strip()
        return http_headers
|
||
|
||
class FileOutput(object):
    """
    FileOutput is based on libhxl-python, with small changes: the file is
    opened with an explicit encoding and without newline translation, and a
    write() method mirrors the one of StreamOutput.
    Last update on this class was 2021-01-25.
    Author: David Megginson
    License: Public Domain
    """

    def __init__(self, filename):
        """Open filename for text writing.
        @param filename: path of the file to create or truncate
        """
        # encoding: do not depend on the platform locale for the output.
        # newline='': write line terminators exactly as received, instead of
        # translating '\n' (which would turn '\r\n' into '\r\r\n' on
        # platforms whose line separator is '\r\n').
        self.output = open(filename, 'w', encoding='utf-8', newline='')

    def __enter__(self):
        """Return this object so it can be used in a with statement."""
        return self

    def __exit__(self, value, type, traceback):
        """Close the file; exceptions from the with body are not suppressed.

        The arguments are received positionally from the with statement and
        are not used here.
        """
        self.output.close()

    def write(self, s):
        """Write the string s to the underlying file."""
        self.output.write(s)
|
||
|
||
class StreamOutput:
    """
    Thin context-manager wrapper around an already open output stream.

    Based on libhxl-python.
    Last update on this class was 2021-01-25.
    Author: David Megginson
    License: Public Domain
    """

    def __init__(self, output):
        """Keep a reference to the wrapped stream in self.output."""
        self.output = output

    def __enter__(self):
        """Entering the context yields the wrapper itself."""
        return self

    def __exit__(self, value, type, traceback):
        """Leave the wrapped stream open; nothing is suppressed."""
        return None

    def write(self, s):
        """Forward the string s to the wrapped stream."""
        sink = self.output
        sink.write(s)
|
||
|
||
if __name__ == "__main__":

    # Use a distinct instance name: rebinding HXL2Pandas to an instance would
    # make the class itself unreachable for the rest of the module.
    hxl2pandas = HXL2Pandas()
    args = hxl2pandas.make_args_HXL2Pandas()

    # Hand the status returned by the converter to the shell instead of
    # discarding it.
    sys.exit(hxl2pandas.do_example_output(args))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters