Skip to content

Commit

Permalink
Update to ReadStat 1.1.9 (#713)
Browse files Browse the repository at this point in the history
Maintains iconv hack from c1f9f19 and Solaris hack from 4a878a1.

* Fix various SAS catalog file reading bugs (fix #529, fix #653, fix #680, fix #696, fix #705).
* Increase maximum SAS page file size to 16MiB (fix #697).
* Ignore invalid SAV timestamp strings (fix #683).
* Fix compiler warnings (fix #707).
  • Loading branch information
gorcha authored Feb 22, 2023
1 parent 53dd647 commit 196e8eb
Show file tree
Hide file tree
Showing 18 changed files with 172 additions and 122 deletions.
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# haven (development version)

* Updated to ReadStat 1.1.9.

* Fix various SAS catalog file reading bugs (#529, #653, #680, #696, #705).
* Increase maximum SAS page file size to 16MiB (#697).
* Ignore invalid SAV timestamp strings (#683).
* Fix compiler warnings (#707).

* Fixed issue in `write_*()` functions where invisible return of input data
frame included unintended alteration of date/time variables. (@jmobrien, #702)

Expand Down
14 changes: 14 additions & 0 deletions src/readstat/NEWS
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
New in 1.1.9:

* SAV reader: Improved support for Asian code pages #263
* SAV reader: Improved support for Very Long String records #287
* SAS reader: Improved support for RLE decompression #245 #253
* SAS reader: Support 16MiB page sizes #286
* SAS catalog reader: Fix bugs reading big-endian files #293
* SAS catalog reader: Allow formats with no labels #290
* SAS catalog reader: Check for long names in 64-bit files #291
* Improved compatibility with -Wstrict-prototypes #295
* Replace sprintf with snprintf #292

New in 1.1.8:

* XPT reader/writer: Improved support for format strings #257 #258
* DTA writer: Fix off-by-one error in v,o indexing for string refs #270
* SAV/DTA writers: Improved checking of non-ASCII characters #256
* SAS7BDAT reader: Fix use-after-free error #273
* Build: Link to libm on GNU systems #255
* SAS commands: Support more syntax
* SPSS commands: Make file names optional
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/readstat_bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#undef READSTAT_MACHINE_IS_TWOS_COMPLEMENT
#define READSTAT_MACHINE_IS_TWOS_COMPLEMENT 0

int machine_is_little_endian();
int machine_is_little_endian(void);

char ones_to_twos_complement1(char num);
int16_t ones_to_twos_complement2(int16_t num);
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/readstat_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, size_t src_len, iconv_t converter) {
/* strip off spaces from the input because the programs use ASCII space
* padding even with non-ASCII encoding. */
while (src_len && src[src_len-1] == ' ') {
while (src_len && (src[src_len-1] == ' ' || src[src_len-1] == '\0')) {
src_len--;
}
if (dst_len == 0) {
Expand Down
10 changes: 6 additions & 4 deletions src/readstat/readstat_malloc.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#include <stdlib.h>

#define MAX_MALLOC_SIZE 0xFFF000
/* ~16 MB. Needs to be at least 0x3FF000, i.e. the default ~4MB block size used
* in compressed SPSS (ZSAV) files. The purpose here is to prevent massive
* allocations in the event of a malformed file or a bug in the library. */
#define MAX_MALLOC_SIZE 0x1000000
/* =16 MiB. Needs to be at least 0x3FF000, i.e. the default ~4MB block size used
* in compressed SPSS (ZSAV) files. Some SAS installations use 16MiB page sizes
* by default, see https://github.com/tidyverse/haven/issues/697.
* The purpose here is to prevent massive allocations in the event of a
* malformed file or a bug in the library. */

void *readstat_malloc(size_t len) {
if (len > MAX_MALLOC_SIZE || len == 0) {
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/readstat_variable.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <stdlib.h>
#include "readstat.h"

static readstat_value_t make_blank_value();
static readstat_value_t make_blank_value(void);
static readstat_value_t make_double_value(double dval);

static readstat_value_t make_blank_value() {
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/sas/ieee.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ static void ieee2xpt(unsigned char *ieee, unsigned char *xport);

#ifndef FLOATREP
#define FLOATREP get_native()
int get_native();
int get_native(void);
#endif

void memreverse(void *intp_void, int l) {
Expand Down
33 changes: 19 additions & 14 deletions src/readstat/sas/readstat_sas7bcat_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ static readstat_error_t sas7bcat_parse_value_labels(const char *value_start, siz

/* Pass 1 -- find out the offset of the labels */
for (i=0; i<label_count_capacity; i++) {
if (&lbp1[3] - value_start > value_labels_len || lbp1[2] < 0) {
if (&lbp1[3] - value_start > value_labels_len || sas_read2(&lbp1[2], ctx->bswap) < 0) {
retval = READSTAT_ERROR_PARSE;
goto cleanup;
}
Expand All @@ -76,7 +76,7 @@ static readstat_error_t sas7bcat_parse_value_labels(const char *value_start, siz
}
value_offset[label_pos] = lbp1 - value_start;
}
lbp1 += 6 + lbp1[2];
lbp1 += 6 + sas_read2(&lbp1[2], ctx->bswap);
}

const char *lbp2 = lbp1;
Expand All @@ -93,7 +93,7 @@ static readstat_error_t sas7bcat_parse_value_labels(const char *value_start, siz
readstat_value_t value = { .type = is_string ? READSTAT_TYPE_STRING : READSTAT_TYPE_DOUBLE };
char string_val[4*16+1];
if (is_string) {
size_t value_entry_len = 6 + lbp1[2];
size_t value_entry_len = 6 + sas_read2(&lbp1[2], ctx->bswap);
retval = readstat_convert(string_val, sizeof(string_val),
&lbp1[value_entry_len-16], 16, ctx->converter);
if (retval != READSTAT_OK)
Expand Down Expand Up @@ -143,18 +143,20 @@ static readstat_error_t sas7bcat_parse_block(const char *data, size_t data_size,
readstat_error_t retval = READSTAT_OK;

size_t pad = 0;
int label_count_capacity = 0;
int label_count_used = 0;
uint64_t label_count_capacity = 0;
uint64_t label_count_used = 0;
int payload_offset = 106;
uint16_t flags = 0;
char name[4*32+1];

if (data_size < payload_offset)
goto cleanup;

pad = (data[2] & 0x08) ? 4 : 0; // might be 0x10, not sure
flags = sas_read2(&data[2], ctx->bswap);
pad = (flags & 0x08) ? 4 : 0; // might be 0x10, not sure
if (ctx->u64) {
label_count_capacity = sas_read4(&data[42+pad], ctx->bswap);
label_count_used = sas_read4(&data[50+pad], ctx->bswap);
label_count_capacity = sas_read8(&data[42+pad], ctx->bswap);
label_count_used = sas_read8(&data[50+pad], ctx->bswap);

payload_offset += 32;
} else {
Expand All @@ -169,7 +171,7 @@ static readstat_error_t sas7bcat_parse_block(const char *data, size_t data_size,
pad += 16;
}

if ((data[2] & 0x80) && !ctx->u64) { // has long name
if (((flags & 0x80) && !ctx->u64) || ((flags & 0x20) && ctx->u64)) { // has long name
if (data_size < payload_offset + pad + 32)
goto cleanup;

Expand All @@ -182,6 +184,9 @@ static readstat_error_t sas7bcat_parse_block(const char *data, size_t data_size,
if (data_size < payload_offset + pad)
goto cleanup;

if (label_count_used == 0)
goto cleanup;

if ((retval = sas7bcat_parse_value_labels(&data[payload_offset+pad], data_size - payload_offset - pad,
label_count_used, label_count_capacity, name, ctx)) != READSTAT_OK)
goto cleanup;
Expand All @@ -200,15 +205,15 @@ static readstat_error_t sas7bcat_augment_index(const char *index, size_t len, sa
break;

if (xlsr[ctx->xlsr_O_offset] == 'O') {
uint32_t page = 0, pos = 0;
uint64_t page = 0, pos = 0;
if (ctx->u64) {
page = sas_read4(&xlsr[8], ctx->bswap);
pos = sas_read4(&xlsr[16], ctx->bswap);
page = sas_read8(&xlsr[8], ctx->bswap);
pos = sas_read2(&xlsr[16], ctx->bswap);
} else {
page = sas_read2(&xlsr[4], ctx->bswap);
page = sas_read4(&xlsr[4], ctx->bswap);
pos = sas_read2(&xlsr[8], ctx->bswap);
}
ctx->block_pointers[ctx->block_pointers_used++] = ((uint64_t)page << 32) + pos;
ctx->block_pointers[ctx->block_pointers_used++] = (page << 32) + pos;
}

if (ctx->block_pointers_used == ctx->block_pointers_capacity) {
Expand Down
3 changes: 2 additions & 1 deletion src/readstat/sas/readstat_sas7bdat_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,6 @@ static readstat_variable_t *sas7bdat_init_variable(sas7bdat_ctx_t *ctx, int i,

cleanup:
if (retval != READSTAT_OK) {
free(variable);
if (out_retval)
*out_retval = retval;

Expand All @@ -716,6 +715,8 @@ static readstat_variable_t *sas7bdat_init_variable(sas7bdat_ctx_t *ctx, int i,
}
}

free(variable);

return NULL;
}

Expand Down
7 changes: 7 additions & 0 deletions src/readstat/sas/readstat_sas_rle.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ typedef SSIZE_T ssize_t;
#include "readstat_sas_rle.h"

#define SAS_RLE_COMMAND_COPY64 0
#define SAS_RLE_COMMAND_COPY64_PLUS_4096 1
#define SAS_RLE_COMMAND_COPY96 2
#define SAS_RLE_COMMAND_INSERT_BYTE18 4
#define SAS_RLE_COMMAND_INSERT_AT17 5
#define SAS_RLE_COMMAND_INSERT_BLANK17 6
Expand All @@ -29,6 +31,7 @@ typedef SSIZE_T ssize_t;

static size_t command_lengths[16] = {
[SAS_RLE_COMMAND_COPY64] = 1,
[SAS_RLE_COMMAND_COPY64_PLUS_4096] = 1,
[SAS_RLE_COMMAND_INSERT_BYTE18] = 2,
[SAS_RLE_COMMAND_INSERT_AT17] = 1,
[SAS_RLE_COMMAND_INSERT_BLANK17] = 1,
Expand Down Expand Up @@ -62,6 +65,10 @@ ssize_t sas_rle_decompress(void *output_buf, size_t output_len,
case SAS_RLE_COMMAND_COPY64:
copy_len = (*input++) + 64 + length * 256;
break;
case SAS_RLE_COMMAND_COPY64_PLUS_4096:
copy_len = (*input++) + 64 + length * 256 + 4096;
break;
case SAS_RLE_COMMAND_COPY96: copy_len = length + 96; break;
case SAS_RLE_COMMAND_INSERT_BYTE18:
insert_len = (*input++) + 18 + length * 256;
insert_byte = *input++;
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/spss/readstat_por.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ ssize_t por_utf8_encode(const unsigned char *input, size_t input_len,
}
/* TODO - For some reason that replacement character isn't recognized
* by some systems, so be prepared to insert an ASCII space instead */
int printed = sprintf(output + offset, "%lc", codepoint);
int printed = snprintf(output + offset, output_len - offset, "%lc", codepoint);
if (printed > 0) {
offset += printed;
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/readstat/spss/readstat_por.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ typedef struct por_ctx_s {
ck_hash_table_t *var_dict;
} por_ctx_t;

por_ctx_t *por_ctx_init();
por_ctx_t *por_ctx_init(void);
void por_ctx_free(por_ctx_t *ctx);
ssize_t por_utf8_encode(const unsigned char *input, size_t input_len,
char *output, size_t output_len, uint16_t lookup[256]);
Expand Down
Loading

0 comments on commit 196e8eb

Please sign in to comment.