# ciscat_xml2csv.py

#!/usr/bin/python3

import re
import argparse
import xml.etree.ElementTree as etree
import html2text

__author__ = "Xavier Garceau-Aranda"
__license__ = "GPL v2"
__version__ = "0.2"
__maintainer__ = "Xavier Garceau-Aranda"
__email__ = "xavier.garceau-aranda@owasp.org"
__status__ = "Testing"

# Command-line interface: the XML report to read and the CSV file to write
# are both mandatory options.
parser = argparse.ArgumentParser()
for option_flags, option_help in (
        (('--xml', '-x'), 'Input XML file generated by CISCAT'),
        (('--csv', '-c'), 'Output CSV file generated by this script')):
    parser.add_argument(*option_flags, required=True, type=str,
                        help=option_help)
args = parser.parse_args()

# Recursion level whose Group title is recorded as the category of the rules
# below it (recursive_iter_over_group compares its level against this value;
# top-level groups are visited with level 0). Raise it to use deeper branches.
group_title_depth = 1

print('[+] Processing file %s' % (args.xml,))

# Load the whole report; every later step walks the children of root.
tree = etree.parse(args.xml)
root = tree.getroot()

"""
First step is to build a dict with Rule ids and pass/fail results

XML structure:
<Benchmark xmlns="http://checklists.nist.gov/xccdf/1.2"
   <TestResult end-time="2015-12-18T10:10:51.776+01:00">
         <rule-result idref="xccdf_org.cisecurity.benchmarks_rule_1.1.1_L1_ ...
         Set_Enforce_password_history_to_24_or_more_passwords"
"""

# Maps a rule id (the idref attribute of a rule-result element) to the text
# of its result child: pass/fail/error/notselected and so on.
result_dict = {}

for test_result in root:
    # Only TestResult elements carry the results.
    if 'TestResult' not in test_result.tag:
        continue
    for rule_result in test_result:
        # Each rule-result element holds the outcome of one rule.
        if 'rule-result' not in rule_result.tag:
            continue
        rule_ref = rule_result.get('idref')
        for outcome in rule_result:
            if 'result' in outcome.tag:
                result_dict[rule_ref] = outcome.text

"""
Second step is to parse all Groups and their content, and cross-reference with
the result dict for pass/fail/error/notselected result.

XML structure:
<Benchmark xmlns="http://checklists.nist.gov/xccdf/1.2"
    <Group id="xccdf_org.cisecurity.benchmarks_group_1_Account_Policies">
        <Group id="xccdf_org.cisecurity.benchmarks_group_1.1_Password_Policy">
            <Rule id="xccdf_org.cisecurity.benchmarks_rule_1.1.1_L1_Set_ ...
            Enforce_password_history_to_24_or_more_passwords">
"""


class Entry(object):
    """Entry object, containing all the information for a control
        branch the name of the top-level branch for the control
        number the node number
        control the name of the control
        result the pass/fail/error result of the control
        description the description of the control
        remediation the remediation for the control
    """
    def __init__(self,
                 branch, number, control, result, description, remediation):
        self.branch = branch
        self.number = number
        self.control = control
        self.result = result
        self.description = description
        self.remediation = remediation

    @staticmethod
    def _quote(value):
        """Return value as a double-quoted CSV field.

        Embedded double quotes are doubled so that a quote inside a title
        or category cannot terminate the field early.
        """
        return '"%s"' % str(value).replace('"', '""')

    def get_csv_string(self):
        """Return this entry as one CSV line terminated by a newline.

        The columns are branch, control, result and remediation. The number
        and the description are not exported (the description text isn't
        perfect). The remediation column is left empty for controls whose
        result is 'pass'.
        """
        if self.result == 'pass':
            remediation_text = ''
        else:
            remediation_text = self.remediation
        fields = (self.branch, self.control, self.result, remediation_text)
        return ','.join(self._quote(field) for field in fields) + '\n'


def description_node_to_text(node):
    """
        Takes an XML node containing nested strings and returns a single string.

        The node is serialised with etree.tostring(), rendered to text by
        html2text (links ignored) and then cleaned of the escape sequences
        left over from the serialisation. Double quotes are turned into
        single quotes. The result may be an empty string.
    """

    h = html2text.HTML2Text()
    h.ignore_links = True
    html = etree.tostring(node)

    # str(html) is applied to the serialised node as returned by
    # etree.tostring(); the replacements below strip the literal "b'" prefix
    # and the literal backslash-n sequences that show up in that string.
    text = (h.handle(str(html))
            .replace('b\'\\n ', '')
            .replace('\\n \\n \'', '')
            .replace('\\n \\n ', '')
            .replace('\\n ', '')
            .replace('\\n \\n', '')
            .replace('\\n', '')
            .replace('"', '\'')
            .replace('\\\'', '\'')
            .strip())

    # for some reason some text end with an '
    # endswith() is used instead of text[-1] so that an empty text does not
    # raise IndexError.
    # NOTE(review): the slice drops the quote AND the character before it;
    # kept as in the original code - confirm this is intended.
    if text.endswith('\''):
        text = text[:-2]

    return text


def remediation_node_to_text(node):
    """
        Takes an XML node containing nested strings and returns a single string.

        The node is serialised with etree.tostring(), rendered to text by
        html2text (links ignored) and cleaned of leftover escape sequences.
        Whitespace runs are collapsed to single spaces, everything from the
        first occurrence of "Impact" onwards is dropped, and a final '.' is
        added when missing. Each 'Computer Configuration' is moved onto its
        own line. Returns an empty string when nothing is left.
    """

    h = html2text.HTML2Text()
    h.ignore_links = True
    html = etree.tostring(node)

    # str(html) is applied to the serialised node as returned by
    # etree.tostring(); the replacements below strip the literal "b'" prefix
    # and the literal backslash sequences that show up in that string.
    text = (h.handle(str(html))
            .replace('b\'\\n ', '')
            .replace('\\n \\n \'', '')
            .replace('\\n \\n ', '')
            .replace('\\n ', '')
            .replace('\\n \\n', '')
            .replace('\\n', '')
            .replace('"', '\'')
            .replace('\\\'', '\'')
            .replace('\\\\\\', '\\')
            .strip())

    text = re.sub(r'\s+', ' ', text)

    # Keep only what precedes the first "Impact".
    text = text.split("Impact")[0]

    # Whitespace was collapsed above, so at most one space can remain at
    # either end. strip(' ') also copes with an empty string, where indexing
    # text[0] / text[-1] would raise IndexError.
    text = text.strip(' ')
    if text and not text.endswith('.'):
        text += '.'

    text = text.replace('Computer Configuration', '\r\nComputer Configuration')

    return text


def recursive_iter_over_group(node, level):
    """
        As each group (branch) can contain either rules or sub-groups
        (sub-branches), we use a recursive function to iterate over each
        group/sub-group.

        node   the Group element whose children are inspected
        level  recursion depth of node; nested Groups are visited with
               level + 1

        Side effects: assigns the module-level group_title when a title is
        met at depth group_title_depth, appends an Entry to the module-level
        entry_list for every Rule whose result is not notselected /
        notchecked / unknown, and prints a message for unhandled tags and for
        rules that have no entry in result_dict.
    """
    global group_title
    skipped_results = ('notselected', 'notchecked', 'unknown')

    for child in node:
        if 'title' in child.tag:
            if level == group_title_depth:
                group_title = child.text
        elif 'description' in child.tag:
            # Group descriptions are not exported.
            pass
        elif 'Rule' in child.tag:
            rule_id = child.get('id')

            # Start every rule from known defaults so that a rule lacking a
            # title, description or fixtext never inherits the values of the
            # rule processed before it.
            rule_title = ''
            rule_description = ''
            rule_remediation = '-'
            for i in child:
                if 'title' in i.tag:
                    # Collapse line breaks/indentation inside the title; an
                    # empty element has text None.
                    rule_title = re.sub(r'\s+', ' ', i.text or '')
                elif 'description' in i.tag:
                    if len(i):  # nested markup needs flattening
                        rule_description = description_node_to_text(i)
                    else:
                        rule_description = i.text
                elif 'fixtext' in i.tag:
                    rule_remediation = remediation_node_to_text(i)

            # The lookups depend only on the rule id, so they are done once
            # per rule (also for rules without child elements).
            if rule_id not in result_dict:
                print('[-] No result found for rule %s.' % rule_id)
                continue

            # Fourth '_'-separated field of the id, e.g. '1.1.1' in
            # xccdf_org.cisecurity.benchmarks_rule_1.1.1_L1_...
            rule_number = rule_id.split('_')[3]
            rule_result = result_dict[rule_id]

            if rule_result not in skipped_results:
                new_entry = Entry(group_title,
                                  rule_number,
                                  rule_title,
                                  rule_result,
                                  rule_description,
                                  rule_remediation)
                entry_list.append(new_entry)

        elif 'Group' in child.tag:
            recursive_iter_over_group(child, level + 1)
        else:  # unhandled case
            print('[-] Unhandled tag %s.' % child.tag)

# Entry objects collected by recursive_iter_over_group.
entry_list = []
# recursive_iter_over_group only assigns group_title when it meets a title at
# depth group_title_depth; give it a defined value first so that a Rule
# reached before any such title does not hit an undefined global.
group_title = ''
for child in root:  # iterate over root
    if 'Group' in child.tag:  # each Group is a branch
        recursive_iter_over_group(child, 0)  # recursive iterate over each group

"""
Third step is to create a csv file with all the info
"""

print('[+] Generation file %s' % args.csv)

# The context manager closes the file even when encoding or writing an entry
# fails. The file is opened in binary mode and every line is encoded as UTF-8
# before being written.
with open(args.csv, 'wb') as f:
    f.write(bytes('Category, Title, Result, Remediation\n', 'UTF-8'))
    for entry in entry_list:
        f.write(bytes(entry.get_csv_string(), 'UTF-8'))