This document describes an implementation of a weewiki
keyword scraper. This keyword scraper goes through every
wiki page and looks for any word or phrase surrounded by
equal sign characters. The output information will
be written to standard output. From there, this text can
be trivially parsed by external programs.
This compiles to a single file, called keyscrape.c
.
#include <stdio.h>
#include <stdlib.h>
#include <sqlite3.h>
#include <string.h>
#include "orgparse.h"
#include "weewiki.h"
<<funcdefs>>
<<funcs>>
The function ww_keyscrape
is a public function designed to
work with the main weewiki CLI.
/* Public entry point for the keyscrape command: scrapes every wiki page
 * and returns 0. The argc/argv arguments are accepted but not read. */
int ww_keyscrape(int argc, char *argv[]);
/*
 * Entry point used by the weewiki command-line front end.
 *
 * Scrapes keywords from every page and writes them to standard output.
 * The command takes no options, so both arguments are ignored.
 * Always returns 0.
 */
int ww_keyscrape(int argc, char *argv[])
{
    /* Unused: the signature is fixed by the CLI dispatcher. */
    (void)argc;
    (void)argv;

    parse_all_pages();

    return 0;
}
A single weewiki page is parsed with parse_file
.
At this point, it is assumed the page name key
and
its contents val
are loaded up into memory.
/* Scans the text of one page (val, valsz bytes) for spans enclosed in
 * '=' characters and writes one record per span to fp: the page name
 * (key, keysz bytes), the captured text, and the line number, separated
 * by tabs and terminated by a newline. */
static void parse_file(const char *key, size_t keysz,
const char *val, size_t valsz,
FILE *fp);
Parsing happens on a line-by-line basis. It reads up to the next line break, then parses that chunk. At the end, anything remaining will be parsed.
Each line is checked for enclosing equal sign characters. The left equal must not have a space following it, and the right equal must not have a space preceding it.
If such a pattern is found, the content inside is captured.
This is considered to be a “word”, even if there are
multiple words contained in it. The data is then written
to the specified filehandle, presumably stdout
.
Content inside source blocks in worgle is ignored.
A global ignore flag will be used. It will be set
when a line finds a BEGIN_SRC
declaration, and
will turn off when it finds an END_SRC
declaration.
/*
 * Returns non-zero when the first bytes of line (len bytes long) equal
 * the NUL-terminated prefix. The comparison never reads past len, so the
 * line does not have to be NUL-terminated.
 */
static int ks_has_prefix(const char *line, size_t len, const char *prefix)
{
    size_t plen;

    plen = strlen(prefix);
    return len >= plen && memcmp(line, prefix, plen) == 0;
}

/*
 * Writes one output record: "<key>\t<word>\t<linepos>\n".
 * key and word are written as raw byte ranges, not as C strings.
 */
static void ks_emit(FILE *fp, const char *key, size_t keysz,
                    const char *word, size_t wordlen, int linepos)
{
    fwrite(key, 1, keysz, fp);
    fputc('\t', fp);
    fwrite(word, 1, wordlen, fp);
    fprintf(fp, "\t%d\n", linepos);
}

/*
 * Scans a single line (without its line break) for =word= spans and
 * emits a record for each one.
 *
 * An '=' opens a span only when the next character on the line is not a
 * space. An '=' closes a span only when the character before it is not a
 * space; otherwise it is counted as part of the span. A span left open
 * at the end of the line is dropped.
 */
static void ks_scan_line(const char *key, size_t keysz,
                         const char *line, size_t len,
                         int linepos, FILE *fp)
{
    size_t k;
    size_t wordpos;
    size_t wordlen;
    int inword;

    wordpos = 0;
    wordlen = 0;
    inword = 0;

    for (k = 0; k < len; k++) {
        if (line[k] != '=') {
            if (inword) {
                wordlen++;
            }
            continue;
        }

        if (inword) {
            /* k >= wordpos >= 1 here, so line[k - 1] is in bounds. */
            if (line[k - 1] == ' ') {
                /* Not a valid closing marker: keep it in the span. */
                wordlen++;
                continue;
            }
            inword = 0;
            ks_emit(fp, key, keysz, &line[wordpos], wordlen, linepos);
        } else if (k + 1 < len && line[k + 1] != ' ') {
            inword = 1;
            wordpos = k + 1;
            wordlen = 0;
        }
    }
}

/*
 * Handles one line of a page: updates the source-block ignore flag and,
 * when the line is outside a source block, scans it for keywords.
 * The BEGIN_SRC and END_SRC marker lines themselves are never scanned.
 */
static void ks_process_line(const char *key, size_t keysz,
                            const char *line, size_t len,
                            int linepos, int *ignore, FILE *fp)
{
    if (ks_has_prefix(line, len, "#+BEGIN_SRC")) {
        *ignore = 1;
        return;
    }

    if (ks_has_prefix(line, len, "#+END_SRC")) {
        *ignore = 0;
        return;
    }

    if (*ignore) {
        return;
    }

    ks_scan_line(key, keysz, line, len, linepos, fp);
}

/*
 * Scrapes keywords from one wiki page.
 *
 * key, keysz: page name and its length in bytes.
 * val, valsz: page contents and their length in bytes.
 * fp:         stream that receives one "<key>\t<word>\t<line>\n" record
 *             per keyword; line numbers start at 1.
 *
 * The contents are processed line by line. Lines between a line starting
 * with "#+BEGIN_SRC" and one starting with "#+END_SRC" are skipped, but
 * still count towards the line number. Text after the last line break is
 * processed as a final line, so a page without a trailing newline does
 * not lose the keywords on its last line.
 */
static void parse_file(const char *key, size_t keysz,
                       const char *val, size_t valsz,
                       FILE *fp)
{
    size_t i;
    size_t start;
    int linepos;
    int ignore;

    start = 0;
    linepos = 0;
    ignore = 0;

    for (i = 0; i < valsz; i++) {
        if (val[i] != '\n') {
            continue;
        }
        linepos++;
        ks_process_line(key, keysz, &val[start], i - start,
                        linepos, &ignore, fp);
        start = i + 1;
    }

    /* Remainder after the last line break, if any. */
    if (start < valsz) {
        linepos++;
        ks_process_line(key, keysz, &val[start], valsz - start,
                        linepos, &ignore, fp);
    }
}
This is a process that gets applied to each page, and works
very similarly to the export
command.
/* Queries the pages of the wiki database and runs parse_file on each
 * one, writing the results to standard output. */
static void parse_all_pages(void);
/*
 * Runs the keyword scraper over the pages of the wiki database.
 *
 * The body is assembled from four named chunks, expanded in order:
 * open the database, prepare the page query, feed each resulting row
 * to parse_file, and release everything again. The locals declared
 * here (ww, stmt, db, rc) are shared by those chunks.
 */
static void parse_all_pages(void)
{
weewiki_d *ww;
sqlite3_stmt *stmt;
sqlite3 *db;
int rc;
<<init_and_open>>
<<create_sqlite_statement>>
<<iterate_through_pages>>
<<cleanup>>
}
Before any iteration can begin, a few bits of initialization are needed. The database is opened.
ww = malloc(weewiki_sizeof());
weewiki_init(ww);
weewiki_open(ww, "a.db");
db = weewiki_db(ww);
All the pages in the wiki
table are queried, their keys
and values extracted. The keys and values are fed into the
analyzer. First, the SQLite statement is created, then
it is stepped through.
sqlite3_prepare_v2(db,
"SELECT key, value "
"FROM wiki "
"WHERE key NOT LIKE \"@%\";",
-1,
&stmt,
NULL);
rc = sqlite3_step(stmt);
while (rc == SQLITE_ROW) {
const char *key;
size_t keysz;
const char *val;
size_t valsz;
key = (const char *)sqlite3_column_text(stmt, 0);
keysz = sqlite3_column_bytes(stmt, 0);
val = (const char *)sqlite3_column_text(stmt, 1);
valsz = sqlite3_column_bytes(stmt, 1);
parse_file(key, keysz, val, valsz, stdout);
rc = sqlite3_step(stmt);
}
At cleanup, the database is closed, and the statement is cleaned up.
/* Destroy the prepared statement before the database is closed. */
sqlite3_finalize(stmt);
/* Shut down the weewiki state: close, then clean.
 * NOTE(review): presumably weewiki_clean releases what weewiki_init set
 * up - confirm against weewiki.h. */
weewiki_close(ww);
weewiki_clean(ww);
/* ww itself was obtained with malloc() when the database was opened. */
free(ww);