Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Account for leap year bug; fixes #264 #292

Merged
merged 2 commits into from
Mar 13, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# readxl 0.1.1.9000

* The [Lotus 1-2-3 leap year bug](https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year) is accounted for. Date-times prior to March 1, 1900 import correctly. Date-times on the non-existent leap day February 29, 1900 import as NA and throw a warning. (#264, #148, #292 @jennybc)

* Selective column type guessing: `col_types` now accepts `"guess"` to allow user to specify some column types, while allowing others to be guessed (#286 @jennybc)

* Numeric data that appears in a `"date"` column is coerced to a date. Also throws a warning. (#277, #266 @jennybc)
Expand Down
4 changes: 2 additions & 2 deletions src/XlsCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ class XlsCell {
}
}

double asDate(int offset) const {
double asDate(bool is1904) const {
switch(type_) {

case CELL_UNKNOWN:
Expand All @@ -288,7 +288,7 @@ class XlsCell {

case CELL_DATE:
case CELL_NUMERIC:
return dateRound((cell_->d - offset) * 86400);
return POSIXctFromSerial(cell_->d, is1904);
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/XlsWorkBook.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class XlsWorkBook {

// common to Xls[x]WorkBook
std::string path_;
double offset_;
bool is1904_;
std::set<int> dateStyles_;

// kept as data + accessor in XlsWorkBook vs. member function in XlsxWorkBook
Expand All @@ -39,7 +39,7 @@ class XlsWorkBook {
sheets_[i] = Rf_mkCharCE((char*) pWB_->sheets.sheet[i].name, CE_UTF8);
}

offset_ = dateOffset(pWB_->is1904);
is1904_ = pWB_->is1904;

int n_formats = pWB_->formats.count;
for (int i = 0; i < n_formats; ++i) {
Expand All @@ -65,8 +65,8 @@ class XlsWorkBook {
return sheets_;
}

double offset() const {
return offset_;
bool is1904() const {
return is1904_;
}

const std::set<int>& dateStyles() const {
Expand Down
4 changes: 2 additions & 2 deletions src/XlsWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ class XlsWorkSheet {
i + 1, j + 1,
xcell->asStdString());
}
REAL(col)[row] = xcell->asDate(wb_.offset());
REAL(col)[row] = xcell->asDate(wb_.is1904());
break;

case COL_NUMERIC:
Expand Down Expand Up @@ -261,7 +261,7 @@ class XlsWorkSheet {
SET_VECTOR_ELT(col, row, Rf_ScalarLogical(xcell->asInteger()));
break;
case CELL_DATE: {
Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.offset()));
Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.is1904()));
cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt");
cell_val.attr("tzone") = "UTC";
SET_VECTOR_ELT(col, row, cell_val);
Expand Down
4 changes: 2 additions & 2 deletions src/XlsxCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ class XlsxCell {
}
}

double asDate(int offset) const {
double asDate(bool is1904) const {
switch(type_) {

case CELL_UNKNOWN:
Expand All @@ -268,7 +268,7 @@ class XlsxCell {
case CELL_NUMERIC:
{
rapidxml::xml_node<>* v = cell_->first_node("v");
return dateRound((atof(v->value()) - offset) * 86400);
return POSIXctFromSerial(atof(v->value()), is1904);
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/XlsxWorkBook.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class XlsxWorkBook {

// common to Xls[x]WorkBook
std::string path_;
double offset_;
bool is1904_;
std::set<int> dateStyles_;

// specific to XlsxWorkBook
Expand All @@ -121,7 +121,7 @@ class XlsxWorkBook {
path_(path),
rel_(path)
{
offset_ = dateOffset(is1904());
is1904_ = uses1904();
cacheStringTable();
cacheDateStyles();
}
Expand All @@ -138,8 +138,8 @@ class XlsxWorkBook {
return rel_.names();
}

double offset() const {
return offset_;
bool is1904() const {
return is1904_;
}

const std::set<int>& dateStyles() const {
Expand Down Expand Up @@ -229,7 +229,7 @@ class XlsxWorkBook {
}
}

bool is1904() {
bool uses1904() {
std::string workbookXml = zip_buffer(path_, "xl/workbook.xml");
rapidxml::xml_document<> workbook;
workbook.parse<0>(&workbookXml[0]);
Expand Down
4 changes: 2 additions & 2 deletions src/XlsxWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ class XlsxWorkSheet {
Rcpp::warning("Expecting date in [%i, %i]: got '%s'",
i + 1, j + 1, xcell->asStdString(wb_.stringTable()));
}
REAL(col)[row] = xcell->asDate(wb_.offset());
REAL(col)[row] = xcell->asDate(wb_.is1904());
break;

case COL_NUMERIC:
Expand Down Expand Up @@ -278,7 +278,7 @@ class XlsxWorkSheet {
}
case CELL_DATE: {
Rcpp::RObject cell_val =
Rf_ScalarReal(xcell->asDate(wb_.offset()));
Rf_ScalarReal(xcell->asDate(wb_.is1904()));
cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt");
cell_val.attr("tzone") = "UTC";
SET_VECTOR_ELT(col, row, cell_val);
Expand Down
64 changes: 57 additions & 7 deletions src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,35 @@
#include <cerrno>
#include "StringSet.h"

// The date offset needed to align Excel dates with R's use of 1970-01-01
// depends on the "date system".
//
// xls ------------------------------------------------------------------------
// Page and section numbers below refer to
// [MS-XLS]: Excel Binary File Format (.xls) Structure
// version, date, and download URL given in XlsCell.h
//
// 2.4.77 Date1904 p257 ... it boils down to a boolean
// 0 --> 1900 date system
// 1 --> 1904 date system
//
// xlsx ------------------------------------------------------------------------
// Page and section numbers below refer to
// ECMA-376
// version, date, and download URL given in XlsxCell.h
//
// 18.2.28 workbookPr (Workbook Properties) p1582
// date1904:
// Value that indicates whether to use a 1900 or 1904 date system when
// converting serial date-times in the workbook to dates.
// A value of 1 or true indicates the workbook uses the 1904 date system.
// A value of 0 or false indicates the workbook uses the 1900 date system. (See
// 18.17.4.1 for the definition of the date systems.)
// The default value for this attribute is false.
// in xl/workbook.xml, node workbook, child node workbookPr
// attribute date1904:
// 0 or false --> 1900 date system
// 1 or true --> 1904 date system (this is the default)
//
// 18.17.4.1 p2067 holds definition of the date systems
//
// Date systems ---------------------------------------------------------------
// 1900 system: first possible date is 1900-01-01 00:00:00,
// which has serial value of **1**
// 1904 system: origin 1904-01-01 00:00:00
inline double dateOffset(bool is1904) {
// as.numeric(as.Date("1899-12-30"))
// as.numeric(as.Date("1904-01-01"))
Expand All @@ -29,6 +50,35 @@ inline double dateRound(double dttm) {
return ms / 10000;
}

// this is even more horrible
// correct for Excel's faithful re-implementation of the Lotus 1-2-3 bug,
// in which February 29, 1900 is included in the date system, even though 1900
// was not actually a leap year
// https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year
// How we address this:
// If date is *prior* to the non-existent leap day: add a day
// If date is on the non-existent leap day: make negative and, in due course, NA
// Otherwise: do nothing
inline double removeLeapDay(double xlDate) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this would be slightly easier to follow if you combined these two functions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes good point. Done 2d57d7a

if (xlDate >= 61) {
return xlDate;
} else {
return (xlDate < 60) ? ++xlDate : -1;
}
}

inline double POSIXctFromSerial(double xlDate, bool is1904) {
if (!is1904) {
xlDate = removeLeapDay(xlDate);
}
if (xlDate < 0) {
Rcpp::warning("NA inserted for impossible 1900-02-29 datetime");
return NA_REAL;
} else {
return dateRound((xlDate - dateOffset(is1904)) * 86400);
}
}

// Simple parser: does not check that order of numbers and letters is correct
inline std::pair<int, int> parseRef(const char* ref) {
int col = 0, row = 0;
Expand Down
Binary file not shown.
Binary file not shown.
26 changes: 26 additions & 0 deletions tests/testthat/test-dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,29 @@ test_that("date subsecond rounding works", {
df <- read_excel(test_sheet("datetime-rounding.xls"))
expect_identical(as.character(df$dttm), df$dttm_string)
})

## Lotus 1-2-3 leap year bug
## #264, #148
test_that("we get correct dates prior to March 1, 1900, in 1900 date system", {
## xlsx
expect_warning(
df <- read_excel(test_sheet("dates-leap-year-1900-xlsx.xlsx"),
col_types = c("date", "text", "logical")),
"NA inserted for impossible 1900-02-29 datetime"
)
dttms <- as.POSIXct(df$dttm_string, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
leap_day <- df$dttm_string == "1900-02-29 08:00:00"
expect_identical(df$dttm[!leap_day], dttms[!leap_day])
expect_true(is.na(df$dttm[leap_day]))

## xls
expect_warning(
df <- read_excel(test_sheet("dates-leap-year-1900-xls.xls"),
col_types = c("date", "text", "logical")),
"NA inserted for impossible 1900-02-29 datetime"
)
dttms <- as.POSIXct(df$dttm_string, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
leap_day <- df$dttm_string == "1900-02-29 08:00:00"
expect_identical(df$dttm[!leap_day], dttms[!leap_day])
expect_true(is.na(df$dttm[leap_day]))
})