Skip to content

Commit

Permalink
hxl2pandas (#4): created, based on hxl2example
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Jan 26, 2021
1 parent 05e1687 commit 3608e08
Show file tree
Hide file tree
Showing 3 changed files with 309 additions and 4 deletions.
4 changes: 2 additions & 2 deletions bin/hxl2arff
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# COMPANY: EticaAI
# LICENSE: Public Domain dedication
# SPDX-License-Identifier: Unlicense
# VERSION: v1.0
# VERSION: v1.0-draft
# CREATED: 2021-01-25 23:34 UTC v1.0 imported from hxl2example
# REVISION: ---
#===============================================================================
Expand All @@ -39,7 +39,7 @@ STDIN = sys.stdin.buffer
class HXL2Arff:
"""
HXL2Arff is a classe to export already HXLated data in the format
example.
.arff
"""

def __init__(self):
Expand Down
305 changes: 305 additions & 0 deletions bin/hxl2pandas
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
#===============================================================================
#
# FILE: hxl2pandas
#
# USAGE: cat hxlated-file.csv | hxl2pandas
#
# DESCRIPTION: Important point: both the `hxl2pandas` and the
# EticaAI-Data_HXL-Data-Science-file-formats_Pandas reference
# table are mostly as reference of how pandas (more specifically
# DataFrame) could be used as an intermediate format to export
# HXL to other formats already supported by Pandas.
#
# While the reference table may still be useful for those who
# are doing manual conversion or to help understand how different
# tools used for data mining / machine learning would use HXL
# attributes, the `hxl2pandas` may not be implemented at all.
# Also some of the intermediate formats may be converted using
# other libraries.
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# - libhxl (https://pypi.org/project/libhxl/)
# BUGS: ---
# NOTES: ---
# AUTHOR: Emerson Rocha <rocha[at]ieee.org>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication
# SPDX-License-Identifier: Unlicense
# VERSION: v1.0-draft
# CREATED: 2021-01-26 03:43 UTC v1.0 imported from hxl2example
# REVISION: ---
#===============================================================================

import sys
import os
import logging
import argparse

# Do not import hxl, to avoid circular imports
import hxl.converters
import hxl.filters
import hxl.io

# In Python2, sys.stdin is a byte stream; in Python3, it's a text stream
STDIN = sys.stdin.buffer


class HXL2Pandas:
    """
    HXL2Pandas is a class to export already HXLated data.

    This is a draft derived from hxl2example: at this stage it only re-emits
    the HXL source through hxl.io.write_hxl (CSV output); no pandas-specific
    conversion is implemented yet.
    """

    def __init__(self):
        """
        Constructs all the necessary attributes for the HXL2Pandas object.
        """
        # Helper with the shared command line / input / output logic. It is
        # created lazily (see make_args_HXL2Pandas and do_example_output).
        self.hxlhelper = None
        # Parsed command line arguments (argparse.Namespace) once available.
        self.args = None

        # Posix exit codes
        self.EXIT_OK = 0
        self.EXIT_ERROR = 1
        self.EXIT_SYNTAX = 2

    def make_args_HXL2Pandas(self):
        """
        Build the command line parser, parse sys.argv and apply the common
        options.

        @returns: the parsed arguments (also stored on self.args).
        """
        self.hxlhelper = HXLUtils()
        parser = self.hxlhelper.make_args(
            description=("HXL2Pandas is an example script to create other "
                         "scripts with some bare minimum command line "
                         "interfaces that could work to export HXL files to "
                         "other formats."))

        self.args = parser.parse_args()

        # Without this call the --log option would be accepted but never
        # applied, since nothing else configures the logging module.
        self.hxlhelper.do_common_args(self.args)

        return self.args

    def do_example_output(self, args,
                          stdin=STDIN, stdout=sys.stdout, stderr=sys.stderr):
        """
        The do_example_output is the main entrypoint of HXL2Pandas. When
        called will read the HXL source and write it to the output.

        @param args: parsed arguments, as returned by make_args_HXL2Pandas.
        @param stdin: stream used as input when no input file is given.
        @param stdout: stream used as output when no output file is given.
        @param stderr: currently unused; kept for interface compatibility.
        @returns: self.EXIT_OK when the data was written.
        """
        # Allow calling this method with externally built args, without a
        # previous call to make_args_HXL2Pandas.
        if self.hxlhelper is None:
            self.hxlhelper = HXLUtils()

        # NOTE: the next lines, in fact, only generate a csv output. So you
        #       can use as starting point.
        with self.hxlhelper.make_source(args, stdin) as source, \
                self.hxlhelper.make_output(args, stdout) as output:
            hxl.io.write_hxl(
                output.output,
                source,
                # Honor --remove-headers, which was parsed but ignored before.
                show_headers=not args.remove_headers,
                show_tags=not args.strip_tags
            )

        return self.EXIT_OK


class HXLUtils:
    """
    HXLUtils contains functions from the Console scripts of libhxl-python
    (HXLStandard/libhxl-python/blob/master/hxl/scripts.py) with few changes
    to be used as class (and have one single place to change).
    Last update on this class was 2021-01-25.

    Local change: '--log none' (already advertised by the option help) is now
    accepted and silences all the standard logging levels.

    Author: David Megginson
    License: Public Domain
    """

    def __init__(self):

        self.logger = logging.getLogger(__name__)

        # Posix exit codes
        self.EXIT_OK = 0
        self.EXIT_ERROR = 1
        self.EXIT_SYNTAX = 2

    def make_args(self, description, hxl_output=True):
        """Set up parser with default arguments.
        @param description: usage description to show
        @param hxl_output: if True (default), include options for HXL output.
        @returns: an argument parser, partly set up.
        """
        parser = argparse.ArgumentParser(description=description)
        parser.add_argument(
            'infile',
            help='HXL file to read (if omitted, use standard input).',
            nargs='?'
        )
        if hxl_output:
            parser.add_argument(
                'outfile',
                help='HXL file to write (if omitted, use standard output).',
                nargs='?'
            )
        parser.add_argument(
            '--sheet',
            help='Select sheet from a workbook (1 is first sheet)',
            metavar='number',
            type=int,
            nargs='?'
        )
        parser.add_argument(
            '--selector',
            help='JSONPath expression for starting point in JSON input',
            metavar='path',
            nargs='?'
        )
        parser.add_argument(
            '--http-header',
            help='Custom HTTP header to send with request',
            metavar='header',
            action='append'
        )
        if hxl_output:
            parser.add_argument(
                '--remove-headers',
                help='Strip text headers from the CSV output',
                action='store_const',
                const=True,
                default=False
            )
            parser.add_argument(
                '--strip-tags',
                help='Strip HXL tags from the CSV output',
                action='store_const',
                const=True,
                default=False
            )
        parser.add_argument(
            "--ignore-certs",
            help="Don't verify SSL connections (useful for self-signed)",
            action='store_const',
            const=True,
            default=False
        )
        parser.add_argument(
            '--log',
            help='Set minimum logging level',
            metavar='debug|info|warning|error|critical|none',
            # 'none' is listed in the metavar, so it must be a valid choice;
            # do_common_args knows how to handle it.
            choices=['debug', 'info', 'warning', 'error', 'critical', 'none'],
            default='error'
        )
        return parser

    def add_queries_arg(
        self,
        parser,
        help='Apply only to rows matching at least one query.'
    ):
        """Add the repeatable -q/--query option to a parser.
        @param parser: the argument parser to extend.
        @param help: help text shown for the option.
        @returns: the same parser, for chaining.
        """
        parser.add_argument(
            '-q',
            '--query',
            help=help,
            metavar='<tagspec><op><value>',
            action='append'
        )
        return parser

    def do_common_args(self, args):
        """Process standard args (currently only the logging level).
        @param args: parsed arguments with a 'log' attribute.
        """
        if args.log == 'none':
            # The logging module has no 'NONE' level name; a threshold above
            # CRITICAL silences every standard level.
            level = logging.CRITICAL + 1
        else:
            level = args.log.upper()
        logging.basicConfig(
            format='%(levelname)s (%(name)s): %(message)s',
            level=level)

    def make_source(self, args, stdin=STDIN):
        """Create a HXL input source.
        @param args: parsed arguments (see make_input for what is read).
        @param stdin: stream to read when args.infile is not set.
        @returns: the result of hxl.io.data() over the input object.
        """

        # construct the input object
        source_input = self.make_input(args, stdin)
        return hxl.io.data(source_input)

    def make_input(self, args, stdin=sys.stdin, url_or_filename=None):
        """Create an input object.
        @param args: parsed arguments; infile, sheet, selector, http_header
                     and ignore_certs are read.
        @param stdin: stream to use when there is no URL or filename.
        @param url_or_filename: overrides args.infile when not None.
        @returns: the result of hxl.io.make_input().
        """
        # NOTE(review): the default here is the text stream sys.stdin while
        #               make_source defaults to the byte stream STDIN -
        #               confirm which one hxl.io.make_input expects.

        if url_or_filename is None:
            url_or_filename = args.infile

        # sheet index: 1-based on the command line, 0-based for hxl.io
        sheet_index = args.sheet
        if sheet_index is not None:
            sheet_index -= 1

        return hxl.io.make_input(
            url_or_filename or stdin,
            sheet_index=sheet_index,
            # JSONPath selector
            selector=args.selector,
            allow_local=True,
            http_headers=self.make_headers(args),
            verify_ssl=(not args.ignore_certs)
        )

    def make_output(self, args, stdout=sys.stdout):
        """Create an output stream.
        @param args: parsed arguments; only args.outfile is read.
        @param stdout: stream to wrap when args.outfile is not set.
        @returns: a FileOutput for args.outfile, else a StreamOutput.
        """
        if args.outfile:
            return FileOutput(args.outfile)
        return StreamOutput(stdout)

    def make_headers(self, args):
        """Collect the custom HTTP headers.

        Headers come from the HXL_HTTP_HEADER environment variable (if set)
        followed by every --http-header option; each one is split on the
        first ':' into name and value. On duplicated names the last wins.

        @param args: parsed arguments; only args.http_header is read.
        @returns: dict mapping header names to values.
        """
        header_strings = []
        env_header = os.environ.get("HXL_HTTP_HEADER")
        if env_header is not None:
            header_strings.append(env_header)
        if args.http_header is not None:
            header_strings += args.http_header

        http_headers = {}
        for header_string in header_strings:
            name, _, value = header_string.partition(':')
            http_headers[name.strip()] = value.strip()
        return http_headers


class FileOutput(object):
    """
    Context manager that opens a file for text writing and closes it on exit.

    Based on FileOutput from libhxl-python (last synced on 2021-01-25), with
    local changes: the file is opened with newline='', the __exit__
    parameters follow the context manager protocol order and a write()
    method was added for parity with StreamOutput.

    Author: David Megginson
    License: Public Domain
    """

    def __init__(self, filename):
        """
        @param filename: path of the file to create or truncate.
        """
        # newline='' disables newline translation: the csv module documents
        # that files receiving CSV rows must be opened this way, otherwise
        # an extra '\r' is written on platforms that use '\r\n'.
        # NOTE(review): the encoding is still the platform default; consider
        #               an explicit encoding if outputs must be portable.
        self.output = open(filename, 'w', newline='')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always close the file; exceptions are not suppressed (returns None).
        self.output.close()

    def write(self, s):
        """Write the string s to the underlying file."""
        self.output.write(s)


class StreamOutput(object):
    """
    Context manager wrapper around an already open output stream.

    Unlike FileOutput, leaving the context does NOT close the stream, since
    it is owned by the caller.

    Based on StreamOutput from libhxl-python (last synced on 2021-01-25);
    only the __exit__ parameter names were corrected.

    Author: David Megginson
    License: Public Domain
    """

    def __init__(self, output):
        """
        @param output: object with a write() method.
        """
        self.output = output

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing to release; exceptions are not suppressed (returns None).
        pass

    def write(self, s):
        """Write s to the wrapped stream."""
        self.output.write(s)


if __name__ == "__main__":

    # Use a distinct name for the instance: rebinding HXL2Pandas would hide
    # the class for the rest of the module.
    hxl2pandas = HXL2Pandas()
    args = hxl2pandas.make_args_HXL2Pandas()

    # Propagate the POSIX exit code returned by the conversion to the shell.
    sys.exit(hxl2pandas.do_example_output(args))
4 changes: 2 additions & 2 deletions bin/hxl2tab
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# COMPANY: EticaAI
# LICENSE: Public Domain dedication
# SPDX-License-Identifier: Unlicense
# VERSION: v1.2
# VERSION: v1.2-draft
# CREATED: 2021-01-24 01:25 UTC
# REVISION: 2021-01-24 02:52 UTC changed from POSIX shell script to python3
# 2021-01-24 23:54 UTC nginxlogs2csv (from Alligo) used as base
Expand All @@ -42,7 +42,7 @@ STDIN = sys.stdin.buffer
class HXL2Tab:
"""
HXL2Tab is a classe to export already HXLated data in the format
example.
.tab
"""

def __init__(self):
Expand Down

0 comments on commit 3608e08

Please sign in to comment.