diff --git a/main.py b/main.py
index 0b76410..a6ca19b 100644
--- a/main.py
+++ b/main.py
@@ -1,259 +1,507 @@
-import requests
-import time
-import urllib.request
 import json
 import re
-import ssl
-from tqdm import tqdm
+import time
+from typing import Optional, List
+from dataclasses import dataclass, field
+import numpy as np
+import requests
+import pandas as pd
+from tqdm.autonotebook import tqdm
 
 # change these variables to change the fandom instance & character category/ies
 FANDOM_SITE = 'coronationstreet'
 CATEGORY = 'Coronation_Street_characters'
 CATEGORIES = [CATEGORY]
-
 JSON_FILE = f"projects/{FANDOM_SITE}.json"
 FANDOM_URL = f'https://{FANDOM_SITE}.fandom.com'
 API_URL = FANDOM_URL + '/api.php'
-BASE_QUERY_URL = API_URL + '?action=query&format=json&list=categorymembers&cmtitle=Category:'
-URL_SUFFIX = '&cmlimit=500&cmcontinue='
-SSL_CONTEXT = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
+NAMESPACES = [
+    ('-2', 'Media'),
+    ('-1', 'Special'),
+    ('0', 'Article'),
+    ('1', 'Talk'),
+    ('2', 'User'),
+    ('3', 'User talk'),
+    ('4', 'Project'),
+    ('5', 'Project talk'),
+    ('6', 'File'),
+    ('7', 'File talk'),
+    ('8', 'MediaWiki'),
+    ('9', 'MediaWiki talk'),
+    ('10', 'Template'),
+    ('11', 'Template talk'),
+    ('12', 'Help'),
+    ('13', 'Help talk'),
+    ('14', 'Category'),
+    ('15', 'Category talk'),
+    ('110', 'Forum'),
+    ('111', 'Forum talk'),
+    ('420', 'GeoJson'),
+    ('421', 'GeoJson talk'),
+    ('500', 'User blog'),
+    ('501', 'User blog comment'),
+    ('502', 'Blog'),
+    ('503', 'Blog talk'),
+    ('710', 'TimedText'),
+    ('711', 'TimedText talk'),
+    ('828', 'Module'),
+    ('829', 'Module talk'),
+    ('1200', 'Message Wall'),
+    ('1201', 'Thread'),
+    ('1202', 'Message Wall Greeting'),
+    ('2000', 'Board'),
+    ('2001', 'Board Thread'),
+    ('2002', 'Topic'),
+    ]
+
+
+def remove_suffix(cell, suffix):
+    if cell and cell.endswith(suffix):
+        l = len(suffix)
+        cell = cell[:-l]
+    else:
+        pass
+    return cell
+
+
+def remove_suffixes(df, col_list, suffix_list):
+    for col in col_list:
+        for suffix in suffix_list:
+            df[col].loc[df[col].str.endswith(suffix, na=False)] = (df[col].loc[df[col].str.endswith(suffix, na=False)]
+                                                                   .apply(lambda x: remove_suffix(x, suffix))
+                                                                   .str.strip())
+    return df
+
+
+# These functions are for getting all pages in a category and their infoboxes.
 def make_list_chunks(lst, n=50):
     """split a list up into sublist chunks of size n (default 50)"""
     return [lst[i:i + n] for i in range(0, len(lst), n)]
 
-
-def get_category_pages(fullurl=None):
-    """Gets names in category, including `pageid` in tuple (for later matching).
-    Replaces `extractURLs`."""
-    if fullurl is None:
-        fullurl = BASE_QUERY_URL + CATEGORY + URL_SUFFIX
-    # print(fullurl)
-    names = []
-    with urllib.request.urlopen(fullurl, context=SSL_CONTEXT) as url:
-        data = json.loads(url.read().decode())
-        for item in data['query']['categorymembers']:
-            name_tuple = (item['pageid'], item['title'])
-            names.append(name_tuple)
-
-    # the API returns a value under `cmcontinue` if there are more results than listed,
-    # to enable successive queries. if no value there, stop while loop with '1'.
-    cmcontinue = data.get('continue', {}).get('cmcontinue', "1")
-
-    return cmcontinue, names
-
-
-def get_titles(categories=CATEGORIES):
-    '''Gives previous functionality from `if __name__ == 'main'` ability to handle multiple categories.'''
-    titles = []
-    for category in tqdm(categories):
+@dataclass
+class WikiAPI:
+    '''A base class for querying a fandom Wiki'''
+    fandom_site: str = FANDOM_SITE
+    fandom_url: str = FANDOM_URL
+    api_url: str = API_URL
+    category: Optional[str] = CATEGORY
+    categories: Optional[list] = field(default_factory=list)
+    namespaces: List = field(default_factory=list)
+    params: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        self.namespaces = NAMESPACES
+        self.params = {'action': 'query',
+                       'format': 'json',
+                       }
+
+    def scrape(self):
+        pass
+
+    def parse(self):
+        pass
+
+    def build(self):
+        self.scrape()
+        self.parse()
+
+    def get_all_namespaces(self, api_url=API_URL):
+        params = {'action': 'query',
+                  'format': 'json',
+                  'meta': 'siteinfo',
+                  'siprop': 'namespaces',
+                  }
+        r = requests.get(api_url, params=params)
+        data = json.loads(r.text)
+        namespaces = data['query']['namespaces']
+        nses = [(k, v.get('canonical', '*')) for k, v in namespaces.items()]
+        return nses
+
+    def get_all_pages(self, namespace=None):
+        '''Get all pages from a particular namespace (defaults to articles).'''
+        params = {'action': 'query',
+                  'format': 'json',
+                  'list': 'allpages',
+                  'aplimit': '500',
+                  'apfilterredir': 'nonredirects',
+                  'apcontinue': '0',
+                  }
+        if namespace is None:
+            namespace = 0
+        params.update({'apnamespace': namespace})
+        all_pages = []
         cont = "0"
         while cont != "1":
-            url = ''.join([BASE_QUERY_URL, category, URL_SUFFIX, cont])
-#            print(url)
-            cmcontinue, pages = get_category_pages(fullurl=url)
-
-            # prune pages that aren't for specific characters
-            pages_refined = [x for x in pages if "Category:" not in x[1]]
-            titles.extend(pages_refined)
-            cont = cmcontinue
-    return titles
-
-
-def get_raw_infoboxes(titles=None, categories=CATEGORIES, sleep=1):
-    '''From a list of title tuples, get the raw json for their infoboxes. This is a bulk query that
-    only gets the first section of the article instead of the full text. `pageid` is the dict key.
-    Uses tqdm to track progress.'''
-    if titles is None:
-        titles = get_titles(categories=categories)
-    try:
-        assert type(titles) == list
-        assert type(sleep) == int or type(sleep) == float
-    except AssertionError:
-        raise TypeError
-
-    # break up title tuples into chunks of 50 or fewer, max for API infobo xquery
-    title_chunks = make_list_chunks(titles)
-
-    raw_infoboxes = {}
-    for chunk in tqdm(title_chunks):
-        time.sleep(sleep) # add sleep so don't overwhelm server
-        title_list = '|'.join([x[1] for x in chunk])
-        params={'action': 'query',
-                'titles': title_list,
-                'format': 'json',
-                'prop': 'revisions',
-                'rvprop': 'content',
-                'rvsection': '0',
-                'rvslots': '*'}
-        r = requests.get(API_URL, params=params)
-        # print(r.url)
-        json_values = r.json()
-        pages = json_values['query']['pages']
-
-        boxes = {int(k): v['revisions'][0]['slots']['main']['*'] for k, v in pages.items()}
-        raw_infoboxes.update(boxes)
-
-    return raw_infoboxes
-
-
-def parse_infobox(info_json):
-    '''Adapts parsing functionality from `extractBoxes` to have input of raw json.
-    Outputs to dict format.'''
-    data = {}
-    for line in info_json.splitlines():
-        if line.startswith('|'):
-            val = line.partition('=')[-1].strip() #val
-            #process value
-            val = val.replace("[[","")
-            val = val.replace("]]","")
-            val = val.replace("}}","")
-            val = val.replace("{{","")
-            val = re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>", val)
-            val = val.replace("()","")
-
-            val = val.strip()
-
-            if any(x in val for x in ['<br>', '<br/>', '<br />']):
-                val = val.replace('<br/>', '<br>').replace('<br />', '<br>')
-                val = val.split('<br>')
-                val = [x.strip() for x in val]
-
-            k = line.partition('=')[0].strip()[1:] #key
-            k = k.strip()
-
-            #process k
-            data[k] = val
-    return data
-
-
-def match_names_to_infoboxes(categories=None,
-                             titles=None,
-                             infoboxes=None):
-    '''Uses `pageid` to match title/name tuple to raw infobox json.'''
-    if not categories:
-        categories = CATEGORIES
-    if not titles:
-        titles = get_titles(categories=categories)
-    if not infoboxes:
-        infoboxes = get_raw_infoboxes(titles)
-
-    raw_infobox_dict = {}
-    page_numbers = [x[0] for x in titles]
-    for pn in page_numbers:
-        title = next(x[1] for x in titles if x[0] == pn)
-        raw_infobox_dict[title] = infoboxes[pn]
-    return raw_infobox_dict
-
-
-def get_parsed_infoboxes(categories=CATEGORIES, titles=None, raw_infoboxes=None):
-    '''Parses the raw infoboxes into dicts from matched title json dict.'''
-    if titles is None and raw_infoboxes is None:
-        titles = get_titles(categories=categories)
-    if raw_infoboxes is None:
-        raw_infoboxes = get_raw_infoboxes(titles=titles)
-
-    matched_infoboxes = match_names_to_infoboxes(titles=titles, infoboxes=raw_infoboxes)
-
-    infoboxes = {k: parse_infobox(v) for k, v in matched_infoboxes.items()}
-    return infoboxes
-
-
-def get_infoboxes(categories=CATEGORIES):
-    '''combines functions to get full pipeline of category list to parsed infoboxes dict,
-    sorted by title.'''
-    print('Retrieving page titles in categories:')
-    titles = get_titles(categories=categories)
-
-    print('\nRetrieving infoboxes for each page title:')
-    infoboxes = get_parsed_infoboxes(titles=titles)
-    return infoboxes
-
-
-def write_infobox_json(categories=CATEGORIES, infoboxes=None, json_file=JSON_FILE):
-    '''Output infobox dict to json file'''
-    if infoboxes is None:
-        infoboxes = get_infoboxes(categories=categories)
-    json_data = json.dumps(infoboxes, indent=4, sort_keys=True)
-    with open(json_file, 'w') as f:
-        f.write(json_data)
-
-
-def extractBox(url=FANDOM_URL,name="Amy_Barlow"):
-    #xyz = 'Stella_Price'
-    print(name)
-    name = '_'.join(name.split())
-    fullurl = '/'.join([url, name])
-    resp = requests.get(fullurl, params={'action': 'raw'})
-    page = resp.text
-
-    data = {}
-    json_data = {}
-
-    for line in page.splitlines():
-        if line.startswith('|'):
-            value = line.partition('=')[-1].strip() #value
-            #process value
-            value = value.replace("[[","")
-            value = value.replace("]]","")
-            value = value.replace("}}","")
-            value = value.replace("{{","")
-            value = re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>", value)
-            value = value.replace("()","")
-
-            value = value.strip()
-
-            #remove any training white spaces left
-            ##if we have a br the key becomes an array
-
-            if any(x in value for x in ['<br>', '<br/>', '<br />']):
-                value = value.replace('<br/>', '<br>').replace('<br />', '<br>')
-                ##value = value.partition('<br>')
-                ##value = [x for x in value if x != "<br>"]
-                ##value = [x.strip() for x in value]
-                value = value.split('<br>')
-                value = [x.strip() for x in value]
-
-            key = line.partition('=')[0].strip()[1:] #key
-            key = key.strip()
-
-            #process key
+            r = requests.get(API_URL, params=params)
+            data = json.loads(r.text)
+            pages = data['query']['allpages']
+            pages = [(x['pageid'], x['title']) for x in pages]
+            all_pages.extend(pages)
             try:
-                data[key] = value
-            except:
-                data = {}
-                data[key] = value
-
-    #json_data = json.dumps(data, indent=4, sort_keys=True)
-
-
-    return data
-
-
-
-def extractURLs(fullurl):
-    print(fullurl)
-    names = []
-    with urllib.request.urlopen(fullurl, context=SSL_CONTEXT) as url:
-        data = json.loads(url.read().decode())
-        for item in data['query']['categorymembers']:
-            for key in item:
-                if key == "title":
-                    names.append(item[key])
-    try:
-        cmcontinue = data['continue']['cmcontinue']
-    except KeyError:
-        cmcontinue = "1"
-    return cmcontinue, names
+                apcontinue = data['continue']['apcontinue']
+            except KeyError:
+                apcontinue = "1"
+            cont = apcontinue
+            params.update({'apcontinue': apcontinue})
+            time.sleep(1)
+        return all_pages
+
+
+@dataclass
+class WikiCategory(WikiAPI):
+    '''Given a category or list of categories, get the subcategories and the pages in those subcategories.
+    Queries the API for both categories & pages at the same time.'''
+    recursive: bool = True
+    group_pages: bool = False
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.params.update({'list': 'categorymembers',
+                            'cmtype': 'subcat|page',
+                            'cmtitle': f'Category:{self.category}',
+                            'cmlimit': 500,
+                            'cmcontinue': '',
+                            })
+        if not self.categories:
+            self.categories = [self.category]
+
+    def scrape(self):
+        self.category_members = self.get_category_members()
+        self.subcats = self.category_members.get('subcats', None)
+        self.pages = self.category_members.get('pages', None)
+        if not self.group_pages:
+            self.pageids = [x[0] for x in self.pages]
+            self.titles = sorted([x[1] for x in self.pages])
+
+    def get_category_members(self, categories=None, recursive=None, group_pages=None, params=None):
+        if categories is None:
+            categories = self.categories
+        if recursive is None:
+            recursive = self.recursive
+        if group_pages is None:
+            group_pages = self.group_pages
+        if params is None:
+            params = self.params
+        items = {}
+        items['categories'] = categories
+        items['subcats'] = []
+        if group_pages:
+            items['pages'] = {}
+        else:
+            items['pages'] = []
+
+        print('Retrieving category members:\n')
+        for category in tqdm(items['categories']):
+            params['cmtitle'] = f'Category:{category}'
+            params['cmcontinue'] = 0
+            while params['cmcontinue'] != 1:
+                r = requests.get(API_URL, params=params)
+                # print(r.url)
+                data = json.loads(r.text)
+                results = data['query']['categorymembers']
+                subcats = [x['title'].replace('Category:', '') for x in results if int(x['ns']) == 14]
+                items['subcats'].extend(subcats)
+                pages = [(x['pageid'], x['title']) for x in results if int(x['ns']) == 0]
+                if group_pages:
+                    if not items['pages'].get(category, None):
+                        items['pages'][category] = []
+                    items['pages'][category].extend(pages)
+                else:
+                    items['pages'].extend(pages)
+                if recursive:
+                    # append new categories to the category list
+                    items['categories'].extend(subcats)
+                if 'batchcomplete' in data.keys():
+                    params['cmcontinue'] = 1
+                else:
+                    params['cmcontinue'] = data['continue']['cmcontinue']
+                time.sleep(1)
+        # prune duplicates (pages likely to re-occur across multiple subcategories)
+        if not group_pages:
+            for k, v in items.items():
+                items[k] = sorted(list(set(v)))
+        return items
+
+
+@dataclass
+class WikiInfobox(WikiAPI):
+    '''Given a list of
+    wikipages, scrape their infoboxes.'''
+    pages: Optional[list] = field(default_factory=list)
+    titles: Optional[list] = field(default_factory=list)
+    recursive: bool = True
+    by_category: bool = True
+    standardize_case: bool = True
+    alert_empty: bool = True
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.params.update({
+            'prop': 'revisions',
+            'rvprop': 'content',
+            'rvsection': '0',
+            'rvslots': '*',
+            })
+        if self.pages and not self.titles:
+            self.titles = [x[1] for x in self.pages]
+
+    def scrape(self):
+        if self.by_category:
+            if not self.categories:
+                self.categories = [self.category]
+            if not self.pages and not self.titles:
+                wikicat = WikiCategory(categories=self.categories, recursive=self.recursive)
+                wikicat.scrape()
+                self.pages = wikicat.pages
+                self.pageids = wikicat.pageids
+                self.titles = wikicat.titles
+        elif not self.titles:
+            self.pageids = [x[0] for x in self.pages]
+            self.titles = [x[1] for x in self.pages]
+        if self.titles:
+            self.params.update({'titles': self.titles})
+            self.raw_infoboxes = self.get_raw_infoboxes()
+            self.matched_raw_infoboxes = self.match_names_to_infoboxes()
+
+    def parse(self):
+        if self.titles:
+            self.unsorted_infoboxes = self.get_parsed_infoboxes()
+            self.infoboxes = self.sort_infoboxes_by_template()
+            self.dfs = self.build_dfs_infobox()
+            if len(self.dfs) == 1:
+                self.df = list(self.dfs.values())[0]
+
+    def get_raw_infoboxes(self, titles=None, params=None):
+        '''From a list of titles, get the raw json for their infoboxes'''
+        if titles is None:
+            titles = self.titles
+        try:
+            assert type(titles) == list
+        except AssertionError:
+            raise TypeError
+        if params is None:
+            params = self.params
+
+        # break up titles into chunks of 50 or fewer
+        title_chunks = make_list_chunks(titles)
+
+        raw_infoboxes = {}
+        print('Retrieving infoboxes for each page title:')
+        for chunk in tqdm(title_chunks):
+            time.sleep(1) # add sleep so don't overwhelm server
+            title_list = '|'.join([x for x in chunk])
+            params.update({'titles': title_list})
+            r = requests.get(API_URL, params=params)
+            json_values = r.json()
+            pages = json_values['query']['pages']
+            boxes = {int(k): v['revisions'][0]['slots']['main']['*'] for k, v in pages.items() if int(k) > 0}
+            # warn if missing infoboxes
+            missing_boxes = {k: v for k, v in pages.items() if int(k) < 1}
+            if missing_boxes:
+                for v in missing_boxes.values():
+                    print(f"Infobox page missing: {v['title']}")
+            raw_infoboxes.update(boxes)
+
+        return raw_infoboxes
+
+    def process_value(self, val):
+        """within the context of an infobox to be parsed, clean up the value after the '=' sign."""
+        val = val.replace("[[","")
+        val = val.replace("]]","")
+        val = val.replace("}}","")
+        val = val.replace("{{","")
+        val = re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>", val)
+        val = val.replace("()","")
+
+        val = val.lstrip('*').strip()
+
+        #remove any trailing white spaces left
+        ##if we have a br the k becomes an array
+
+        if any(x in val for x in ['<br>', '<br/>', '<br />']):
+            val = val.replace('<br/>', '<br>').replace('<br />', '<br>')
+            val = val.split('<br>')
+            val = [x.strip() for x in val]
+
+        # transform true/false to boolean
+        if type(val) == str and val.lower() == 'true':
+            val = True
+        elif type(val) == str and val.lower() == 'false':
+            val = False
+        return val
+
+    def parse_infobox(self, info_json, standardize_case=None):
+        if standardize_case is None:
+            standardize_case = self.standardize_case
+        infoboxes = {}
+        infobox_name = ''
+        k = ''
+        json_lines = info_json.splitlines()
+        for i, line in enumerate(json_lines):
+            is_list = False
+            if re.findall(r'\{\{Infobox.*?', line):
+                infobox_name = re.findall(r'Infobox.*', line)[0].strip().replace('_', ' ')
+                infoboxes[infobox_name] = {}
+            elif line.startswith('|'):
+                # process k
+                k = line.partition('=')[0].strip()[1:]
+                k = k.strip()
+                if self.standardize_case:
+                    k = k.lower()
+
+                # process val
+                val1 = line.partition('=')[-1].strip()
+                val = self.process_value(val1)
+                if type(val) == str and (val1.startswith('*') or not len(val)):
+                    is_list = True
+                    if val1.startswith('*'):
+                        assert len(val1.split('*')) == 2
+                        item_1 = val.lstrip('*').strip()
+                        val = [item_1]
+                    elif json_lines[i+1].startswith('*'):
+                        val = []
+                    else:
+                        is_list = False
+                if is_list:
+                    assert json_lines[i+1].startswith('*')
+                    counter = 0
+                    idx = i
+                    while counter < 20:
+                        # look ahead for other list members, stopping at next parameter field
+                        if json_lines[idx+1].startswith('*'):
+                            new_item = self.process_value(json_lines[idx+1])
+                            val.append(new_item)
+                            idx += 1
+                            counter += 1
+                        else:
+                            break
+
+                elif type(val) == str:
+                    assert '*' not in val
+
+                #process k
+                if not infobox_name:
+                    print('no infobox name:', k, val[:20])
+                else:
+                    infoboxes[infobox_name][k] = val
+
+        return infoboxes
+
+    def match_names_to_infoboxes(self,
+                                 categories=None,
+                                 pages=None,
+                                 titles=None,
+                                 pageids=None,
+                                 infoboxes=None):
+        '''Uses pageids to match title/name tuple to raw infobox json.'''
+        if categories is None:
+            categories = self.categories
+        if pages is None:
+            pages = self.pages
+        if titles is None:
+            if not hasattr(self, 'titles') or not self.titles:
+                titles = [x[1] for x in pages]
+            else:
+                titles = self.titles
+        if pageids is None:
+            if not hasattr(self, 'pageids') or not self.pageids:
+                pageids = [x[0] for x in pages]
+            else:
+                pageids = self.pageids
+        if infoboxes is None:
+            infoboxes = self.raw_infoboxes
+        matched_raw_infoboxes = {}
+        for pid in pageids:
+            title = next(x[1] for x in pages if x[0] == pid)
+            matched_raw_infoboxes[(pid, title)] = infoboxes[pid]
+        return matched_raw_infoboxes
+
+    def get_parsed_infoboxes(self, titles=None, raw_infoboxes=None, standardize_case=None):
+        '''Parses the raw infoboxes into dicts from matched title json dict.'''
+        if titles is None and raw_infoboxes is None:
+            titles = self.titles
+        if raw_infoboxes is None:
+            raw_infoboxes = self.raw_infoboxes
+        if standardize_case is None:
+            standardize_case = self.standardize_case
+
+        matched_infoboxes = self.match_names_to_infoboxes(titles=titles, infoboxes=raw_infoboxes)
+
+        infoboxes = {k: self.parse_infobox(v, standardize_case=standardize_case) for k, v in matched_infoboxes.items()}
+        return infoboxes
+
+    def get_infoboxes_for_title(self, title, standardize_case=None, parsed=True):
+        """For a single title, get the article infoboxes. Do not use to iterate!
+        Use chunking via `self.get_parsed_infoboxes()` instead."""
+        if standardize_case is None:
+            standardize_case = self.standardize_case
+        title = '_'.join(title.split())
+        fullurl = '/'.join([FANDOM_URL, title])
+        r = requests.get(fullurl, params={'action': 'raw',
+                                          'section': '0',
+                                          'format': 'json',
+                                          })
+        page = r.text
+        if parsed:
+            parsed_infobox = self.parse_infobox(page, standardize_case=standardize_case)
+            return parsed_infobox
+        else:
+            return page
+
+    def write_infobox_json(self, categories=None, df=None):
+        '''Output infobox dict to json file'''
+        if categories is None:
+            categories = self.categories
+        if df is None:
+            df = next(iter(self.dfs.values()))
+        df = df.set_index('page_title', drop=True)
+        json_data = df.to_json(indent=4, orient='index')
+        with open("projects/fandom.json", 'w') as f:
+            f.write(json_data)
+
+    def sort_infoboxes_by_template(self, infoboxes=None, alert_empty=None):
+        if alert_empty is None:
+            alert_empty = self.alert_empty
+        if infoboxes is None:
+            infoboxes = self.unsorted_infoboxes
+        sorted_infoboxes = {}
+        for k, v in infoboxes.items():
+            for infobox_name, infobox in v.items():
+                if not sorted_infoboxes.get(infobox_name, None):
+                    sorted_infoboxes[infobox_name] = {}
+                sorted_infoboxes[infobox_name][k] = infobox
+        if alert_empty:
+            empty = [k for k, v in infoboxes.items() if not v.values()]
+            if len(empty):
+                print(f"These entries are missing infoboxes and will not be in df: {empty}")
+        return sorted_infoboxes
+
+    def build_df_infobox(self, infoboxes):
+        df = pd.DataFrame.from_dict(infoboxes, orient='index')
+        df.index.set_names(["pageid", "page_title"], inplace=True)
+        df = df.reset_index()
+        df.pageid = df.pageid.astype(int)
+        df = df.replace('PAGENAME', np.NaN)
+        return df
+
+    def build_dfs_infobox(self, infoboxes=None):
+        if infoboxes is None:
+            infoboxes = self.infoboxes
+        dfs_dict = {}
+        for infobox_name, val in infoboxes.items():
+            dfs_dict[infobox_name] = self.build_df_infobox(val)
+            df_name = 'df_' + infobox_name.replace('Infobox ', '').lower()
+            setattr(self, df_name, dfs_dict[infobox_name])
+        return dfs_dict
 
 
 if __name__ == "__main__":
     print(f'Getting {CATEGORIES} infoboxes from fandom site {FANDOM_SITE}\n')
-    # create infobox dict
-    infoboxes = get_infoboxes(categories=CATEGORIES)
+    # create WikiInfobox instance with default values
+    wi = WikiInfobox(categories=CATEGORIES, recursive=False)
+    wi.build()
 
-    # output infobox dict to json file
-    print('Writing infoboxes to .json file\n')
-    write_infobox_json(infoboxes=infoboxes)
+    # output primary infobox dataframe to json file
+    print(f'Writing infoboxes to {JSON_FILE}\n')
+    wi.write_infobox_json()
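
A minimal usage sketch of the new class-based API, assuming main.py is importable as a module; the category name is illustrative and this mirrors the __main__ block above rather than adding new behaviour:

    from main import WikiInfobox

    # scrape the category members and their infoboxes, then parse them
    wi = WikiInfobox(categories=['Coronation_Street_characters'], recursive=False)
    wi.build()

    # results: parsed infoboxes grouped by template name, plus one DataFrame per template
    print(list(wi.infoboxes.keys()))   # infobox template names found on the pages
    print(list(wi.dfs.keys()))         # wi.df is also set when only one template exists
    wi.write_infobox_json()            # writes the primary DataFrame to projects/fandom.json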