diff --git a/downloads/stata-13-test-files/Stata14TestFile.dta b/downloads/stata-13-test-files/Stata14TestFile.dta new file mode 100644 index 00000000000..6f1c31dc798 Binary files /dev/null and b/downloads/stata-13-test-files/Stata14TestFile.dta differ diff --git a/scripts/search/data/tabular/open-source-at-harvard118.dta b/scripts/search/data/tabular/open-source-at-harvard118.dta new file mode 100644 index 00000000000..864030ae1a0 Binary files /dev/null and b/scripts/search/data/tabular/open-source-at-harvard118.dta differ diff --git a/scripts/search/data/tabular/stata13-auto-withstrls.dta b/scripts/search/data/tabular/stata13-auto-withstrls.dta new file mode 100644 index 00000000000..5ebc30298f4 Binary files /dev/null and b/scripts/search/data/tabular/stata13-auto-withstrls.dta differ diff --git a/scripts/search/data/tabular/stata13-auto.dta b/scripts/search/data/tabular/stata13-auto.dta new file mode 100644 index 00000000000..c369b4e9f79 Binary files /dev/null and b/scripts/search/data/tabular/stata13-auto.dta differ diff --git a/scripts/search/data/tabular/stata14-auto-withstrls.dta b/scripts/search/data/tabular/stata14-auto-withstrls.dta new file mode 100644 index 00000000000..6119b02065e Binary files /dev/null and b/scripts/search/data/tabular/stata14-auto-withstrls.dta differ diff --git a/src/main/java/MimeTypeDisplay.properties b/src/main/java/MimeTypeDisplay.properties index d0d52541a74..a658df0f7ba 100644 --- a/src/main/java/MimeTypeDisplay.properties +++ b/src/main/java/MimeTypeDisplay.properties @@ -18,6 +18,8 @@ application/x-R-2=R Binary application/x-stata=Stata Binary application/x-stata-6=Stata Binary application/x-stata-13=Stata 13 Binary +application/x-stata-14=Stata 14 Binary +application/x-stata-15=Stata 15 Binary text/x-stata-syntax=Stata Syntax application/x-spss-por=SPSS Portable application/x-spss-sav=SPSS SAV diff --git a/src/main/java/MimeTypeFacets.properties b/src/main/java/MimeTypeFacets.properties index 0044def302c..11444fb9813 100644 
--- a/src/main/java/MimeTypeFacets.properties +++ b/src/main/java/MimeTypeFacets.properties @@ -21,6 +21,8 @@ application/x-R-2=data application/x-stata=data application/x-stata-6=data application/x-stata-13=data +application/x-stata-14=data +application/x-stata-15=data text/x-stata-syntax=data application/x-spss-por=data application/x-spss-sav=data diff --git a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java b/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java index 76b6ae9aa18..15c3b34f6af 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java @@ -8,6 +8,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DataTable; +import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.ingest.IngestServiceBean; @@ -101,7 +102,7 @@ public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fil try { tabDataIngest = ingestPlugin.read(fileInputStream, null); } catch (IOException ingestEx) { - output = output.concat("Caught an exception trying to ingest file "+fileName+"."); + output = output.concat("Caught an exception trying to ingest file " + fileName + ": " + ingestEx.getLocalizedMessage()); return output; } @@ -121,6 +122,8 @@ public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fil DataFile dataFile = new DataFile(); dataFile.setStorageIdentifier(tabFilename); + Dataset dataset = new Dataset(); + dataFile.setOwner(dataset); FileMetadata fileMetadata = new FileMetadata(); fileMetadata.setLabel(fileName); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java index f5ea17de568..e91220a1a6e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java @@ -106,7 +106,7 @@ private static String generateOriginalExtension(String fileType) { return ".sav"; } else if (fileType.equalsIgnoreCase("application/x-spss-por")) { return ".por"; - } else if (fileType.equalsIgnoreCase("application/x-stata") || fileType.equalsIgnoreCase("application/x-stata-13")) { + } else if (fileType.equalsIgnoreCase("application/x-stata") || fileType.equalsIgnoreCase("application/x-stata-13") || fileType.equalsIgnoreCase("application/x-stata-14") || fileType.equalsIgnoreCase("application/x-stata-15")) { return ".dta"; } else if (fileType.equalsIgnoreCase("application/x-dvn-csvspss-zip")) { return ".zip"; diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestMessageBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestMessageBean.java index c9886dcab13..811b142d6c0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestMessageBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestMessageBean.java @@ -80,7 +80,7 @@ public void onMessage(Message message) { //Thread.sleep(10000); logger.fine("Finished ingest job;"); } else { - logger.warning("Error occurred during ingest job!"); + logger.warning("Error occurred during ingest job for file id " + datafile_id + "!"); } } catch (Exception ex) { //ex.printStackTrace(); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index debc7ccf044..5d0a7f52be1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -55,7 +55,7 @@ import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReader; -import 
edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTA117FileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.NewDTAFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReaderSpi; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata.RDATAFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata.RDATAFileReaderSpi; @@ -548,9 +548,6 @@ public void produceContinuousSummaryStatistics(DataFile dataFile, File generated if (dataFile.getDataTable().getDataVariables().get(i).isIntervalContinuous()) { logger.fine("subsetting continuous vector"); - StorageIO storageIO = dataFile.getStorageIO(); - storageIO.open(); - if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) { Float[] variableVector = TabularSubsetGenerator.subsetFloatVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); logger.fine("Calculating summary statistics on a Float vector;"); @@ -582,9 +579,6 @@ public void produceDiscreteNumericSummaryStatistics(DataFile dataFile, File gene && dataFile.getDataTable().getDataVariables().get(i).isTypeNumeric()) { logger.fine("subsetting discrete-numeric vector"); - StorageIO storageIO = dataFile.getStorageIO(); - storageIO.open(); - Long[] variableVector = TabularSubsetGenerator.subsetLongVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); // We are discussing calculating the same summary stats for // all numerics (the same kind of sumstats that we've been calculating @@ -618,9 +612,6 @@ public void produceCharacterSummaryStatistics(DataFile dataFile, File generatedT for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) { if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) { - StorageIO storageIO = dataFile.getStorageIO(); - storageIO.open(); - logger.fine("subsetting character vector"); 
String[] variableVector = TabularSubsetGenerator.subsetStringVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue()); //calculateCharacterSummaryStatistics(dataFile, i, variableVector); @@ -678,6 +669,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I // it up with the Ingest Service Provider Registry: String fileName = dataFile.getFileMetadata().getLabel(); TabularDataFileReader ingestPlugin = getTabDataReaderByMimeType(dataFile.getContentType()); + logger.fine("Using ingest plugin " + ingestPlugin.getClass()); if (ingestPlugin == null) { dataFile.SetIngestProblem(); @@ -742,7 +734,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I dataFile = fileService.save(dataFile); dataFile = fileService.save(dataFile); - logger.fine("Ingest failure (IO Exception): "+ingestEx.getMessage()+ "."); + logger.warning("Ingest failure (IO Exception): " + ingestEx.getMessage() + "."); return false; } catch (Exception unknownEx) { // this is a bit of a kludge, to make sure no unknown exceptions are @@ -804,6 +796,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I } if (!postIngestTasksSuccessful) { + logger.warning("Ingest failure (!postIngestTasksSuccessful)."); return false; } @@ -850,6 +843,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I } if (!databaseSaveSuccessful) { + logger.warning("Ingest failure (!databaseSaveSuccessful)."); return false; } @@ -900,6 +894,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I logger.warning("Ingest failed to produce data obect."); } + logger.fine("Returning ingestSuccessful: " + ingestSuccessful); return ingestSuccessful; } @@ -952,7 +947,11 @@ public static TabularDataFileReader getTabDataReaderByMimeType(String mimeType) if (mimeType.equals(FileUtil.MIME_TYPE_STATA)) { ingestPlugin = new DTAFileReader(new 
DTAFileReaderSpi()); } else if (mimeType.equals(FileUtil.MIME_TYPE_STATA13)) { - ingestPlugin = new DTA117FileReader(new DTAFileReaderSpi()); + ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 117); + } else if (mimeType.equals(FileUtil.MIME_TYPE_STATA14)) { + ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 118); + } else if (mimeType.equals(FileUtil.MIME_TYPE_STATA15)) { + ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 119); } else if (mimeType.equals(FileUtil.MIME_TYPE_RDATA)) { ingestPlugin = new RDATAFileReader(new RDATAFileReaderSpi()); } else if (mimeType.equals(FileUtil.MIME_TYPE_CSV) || mimeType.equals(FileUtil.MIME_TYPE_CSV_ALT)) { diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestableDataChecker.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestableDataChecker.java index 37279e6bf75..44b1fa803d2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestableDataChecker.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestableDataChecker.java @@ -54,6 +54,8 @@ public class IngestableDataChecker implements java.io.Serializable { // Map that returns a Stata Release number private static Map stataReleaseNumber = new HashMap(); public static String STATA_13_HEADER = "
117"; + public static String STATA_14_HEADER = "
118"; + public static String STATA_15_HEADER = "
119"; // Map that returns a reader-implemented mime-type private static Set readableFileTypes = new HashSet(); private static Map testMethods = new HashMap(); @@ -91,6 +93,8 @@ public class IngestableDataChecker implements java.io.Serializable { readableFileTypes.add("application/x-spss-por"); readableFileTypes.add("application/x-rlang-transport"); readableFileTypes.add("application/x-stata-13"); + readableFileTypes.add("application/x-stata-14"); + readableFileTypes.add("application/x-stata-15"); Pattern p = Pattern.compile(regex); ptn = Pattern.compile(rdargx); @@ -259,7 +263,45 @@ public String testDTAformat(MappedByteBuffer buff) { } } - + + if ((result == null) && (buff.capacity() >= STATA_14_HEADER.length())) { + // Let's see if it's a "new" STATA (v.14+) format: + buff.rewind(); + byte[] headerBuffer = null; + String headerString = null; + try { + headerBuffer = new byte[STATA_14_HEADER.length()]; + buff.get(headerBuffer, 0, STATA_14_HEADER.length()); + headerString = new String(headerBuffer, "US-ASCII"); + } catch (Exception ex) { + // probably a buffer underflow exception; + // we don't have to do anything... null will + // be returned, below. + } + if (STATA_14_HEADER.equals(headerString)) { + result = "application/x-stata-14"; + } + } + + if ((result == null) && (buff.capacity() >= STATA_15_HEADER.length())) { + // Let's see if it's a "new" STATA (v.14+) format: + buff.rewind(); + byte[] headerBuffer = null; + String headerString = null; + try { + headerBuffer = new byte[STATA_15_HEADER.length()]; + buff.get(headerBuffer, 0, STATA_15_HEADER.length()); + headerString = new String(headerBuffer, "US-ASCII"); + } catch (Exception ex) { + // probably a buffer underflow exception; + // we don't have to do anything... null will + // be returned, below. 
+ } + if (STATA_15_HEADER.equals(headerString)) { + result = "application/x-stata-15"; + } + } + return result; } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReader.java deleted file mode 100644 index f2ba8674389..00000000000 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTA117FileReader.java +++ /dev/null @@ -1,2382 +0,0 @@ -/* - Copyright (C) 2005-2012, by the President and Fellows of Harvard College. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - Dataverse Network - A web application to share, preserve and analyze research data. - Developed at the Institute for Quantitative Social Science, Harvard University. - Version 3.0. 
-*/ - -package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; - - - -import java.io.*; -import java.nio.*; -import java.util.logging.*; - -import java.util.*; -import java.util.regex.*; -import java.text.*; - - -import org.apache.commons.lang.*; -import javax.inject.Inject; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; - -import edu.harvard.iq.dataverse.DataTable; -import edu.harvard.iq.dataverse.datavariable.DataVariable; -import edu.harvard.iq.dataverse.datavariable.SummaryStatistic; -import edu.harvard.iq.dataverse.datavariable.VariableCategory; -//import edu.harvard.iq.dataverse.datavariable.VariableFormatType; -import edu.harvard.iq.dataverse.datavariable.VariableRange; -//import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; - -import edu.harvard.iq.dataverse.ingest.plugin.spi.*; -import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; -import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; -import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; - - -/** - * ingest plugin for Stata 13 (117) DTA file format. - * - * This ingest plugin has been written from scratch for the DVN 4.0, - * since this file format, introduced in STATA 13 is a brand new - * development, independent of and incompatible with the old, "classic" - * dta format. - * - * For the format documentation, see http://www.stata.com/help.cgi?dta - * @author Leonid Andreev - */ - -public class DTA117FileReader extends TabularDataFileReader{ - //@Inject - //VariableServiceBean varService; - // static fields, STATA-specific constants, etc. - - // SECTION TAGS: - // - // The new STATA format features XML-like section tags - - //
...
...
- - // MAIN, TOP-LEVEL FILE SECTION: - - private static final String TAG_DTA_117 = "stata_dta"; - - // HEADER SECTION: - - private static final String TAG_HEADER = "header"; - private static final String TAG_HEADER_FILEFORMATID = "release"; - private static final String TAG_HEADER_BYTEORDER = "byteorder"; - private static final String TAG_HEADER_VARNUMBER = "K"; - private static final String TAG_HEADER_OBSNUMBER = "N"; - private static final String TAG_HEADER_FILELABEL = "label"; - private static final String TAG_HEADER_TIMESTAMP = "timestamp"; - - // MAP SECTION: - - private static final String TAG_MAP = "map"; - - // VARIABLE TYPES SECTION: - - private static final String TAG_VARIABLE_TYPES = "variable_types"; - - // VARIABLE NAMES SECTION: - - private static final String TAG_VARIABLE_NAMES = "varnames"; - - // VARIABLE SORT ORDER SECTION: - - private static final String TAG_SORT_ORDER = "sortlist"; - - // VARIABLE DISPLAY FORMATS: - - private static final String TAG_DISPLAY_FORMATS = "formats"; - - // VALUE LABEL FORMAT NAMES: - // (TODO: add a comment) - - private static final String TAG_VALUE_LABEL_FORMAT_NAMES = "value_label_names"; - - // VARIABLE LABELS: - - private static final String TAG_VARIABLE_LABELS = "variable_labels"; - - // "CHARACTERISTICS": - - private static final String TAG_CHARACTERISTICS = "characteristics"; - private static final String TAG_CHARACTERISTICS_SUBSECTION = "ch"; - - // DATA SECTION! - - private static final String TAG_DATA = "data"; - - // STRLs SECTION: - - private static final String TAG_STRLS = "strls"; - private static final String STRL_GSO_HEAD = "GSO"; - - // VALUE LABELS SECTION: - - private static final String TAG_VALUE_LABELS = "value_labels"; - private static final String TAG_VALUE_LABELS_LBL_DEF = "lbl"; - - // (TODO: should the constants below be isolated in some other class, that - // could be shared between the 2 STATA DTA reader plugins? 
- - private static Map STATA_RELEASE_NUMBER = - new HashMap(); - - private static Map> CONSTANT_TABLE = - new LinkedHashMap>(); - - private static Map release117constant = - new LinkedHashMap(); - - - private static Map byteLengthTable117 = - new HashMap(); - - private static Map variableTypeTable117 = - new LinkedHashMap(); - - private static final int[] LENGTH_HEADER = {60, 109}; - private static final int[] LENGTH_LABEL = {32, 81}; - private static final int[] LENGTH_NAME = {9, 33}; - private static final int[] LENGTH_FORMAT_FIELD = {7, 12, 49}; - private static final int[] LENGTH_EXPANSION_FIELD ={0, 2, 4}; - private static final int[] DBL_MV_PWR = {333, 1023}; - - private static final int DTA_MAGIC_NUMBER_LENGTH = 4; - private static final int NVAR_FIELD_LENGTH = 2; - private static final int NOBS_FIELD_LENGTH = 4; - private static final int TIME_STAMP_LENGTH = 18; - private static final int VAR_SORT_FIELD_LENGTH = 2; - private static final int VALUE_LABEL_HEADER_PADDING_LENGTH = 3; - - private static int MISSING_VALUE_BIAS = 26; - - private byte BYTE_MISSING_VALUE = Byte.MAX_VALUE; - private short INT_MISSIG_VALUE = Short.MAX_VALUE; - private int LONG_MISSING_VALUE = Integer.MAX_VALUE; - - // Static initialization: - - static { - - STATA_RELEASE_NUMBER.put(117, "v.13"); - - release117constant.put("HEADER", LENGTH_HEADER[1]); - release117constant.put("LABEL", LENGTH_LABEL[1]); - release117constant.put("NAME", LENGTH_NAME[1]); - release117constant.put("FORMAT", LENGTH_FORMAT_FIELD[1]); - release117constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]); - release117constant.put("DBL_MV_PWR",DBL_MV_PWR[1]); - - CONSTANT_TABLE.put(117, release117constant); - - // 1, 2 and 4-byte integers: - byteLengthTable117.put("Byte",1); - byteLengthTable117.put("Integer",2); - byteLengthTable117.put("Long",4); - // 4 and 8-byte floats: - byteLengthTable117.put("Float",4); - byteLengthTable117.put("Double",8); - // STRLs are defined in their own section, outside of the - // main 
data. In the section they are referenced - // by 2 x 4 byte values, "(v,o)", 8 bytes total. - byteLengthTable117.put("STRL",8); - - variableTypeTable117.put(65530,"Byte"); - variableTypeTable117.put(65529,"Integer"); - variableTypeTable117.put(65528,"Long"); - variableTypeTable117.put(65527,"Float"); - variableTypeTable117.put(65526,"Double"); - - //variableTypeTable117.put(32768,"STRL"); - } - - private static String[] MIME_TYPE = { - "application/x-stata", - "application/x-stata-13" - }; - - - private static String unfVersionNumber = "6"; - - private static final List FLOAT_MISSING_VALUES = Arrays.asList( - 0x1.000p127f, 0x1.001p127f, 0x1.002p127f, 0x1.003p127f, - 0x1.004p127f, 0x1.005p127f, 0x1.006p127f, 0x1.007p127f, - 0x1.008p127f, 0x1.009p127f, 0x1.00ap127f, 0x1.00bp127f, - 0x1.00cp127f, 0x1.00dp127f, 0x1.00ep127f, 0x1.00fp127f, - 0x1.010p127f, 0x1.011p127f, 0x1.012p127f, 0x1.013p127f, - 0x1.014p127f, 0x1.015p127f, 0x1.016p127f, 0x1.017p127f, - 0x1.018p127f, 0x1.019p127f, 0x1.01ap127f); - - private Set FLOAT_MISSING_VALUE_SET = - new HashSet(FLOAT_MISSING_VALUES); - - private static final List DOUBLE_MISSING_VALUE_LIST = Arrays.asList( - 0x1.000p1023, 0x1.001p1023, 0x1.002p1023, 0x1.003p1023, 0x1.004p1023, - 0x1.005p1023, 0x1.006p1023, 0x1.007p1023, 0x1.008p1023, 0x1.009p1023, - 0x1.00ap1023, 0x1.00bp1023, 0x1.00cp1023, 0x1.00dp1023, 0x1.00ep1023, - 0x1.00fp1023, 0x1.010p1023, 0x1.011p1023, 0x1.012p1023, 0x1.013p1023, - 0x1.014p1023, 0x1.015p1023, 0x1.016p1023, 0x1.017p1023, 0x1.018p1023, - 0x1.019p1023, 0x1.01ap1023); - - private Set DOUBLE_MISSING_VALUE_SET = - new HashSet(DOUBLE_MISSING_VALUE_LIST); - - private static SimpleDateFormat sdf_ymdhmsS = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); // sdf - - - private static SimpleDateFormat sdf_ymd = new SimpleDateFormat("yyyy-MM-dd"); // sdf2 - - - private static SimpleDateFormat sdf_hms = new SimpleDateFormat("HH:mm:ss"); // stf - - - private static SimpleDateFormat sdf_yw = new 
SimpleDateFormat("yyyy-'W'ww"); - - - - // stata's calendar - private static Calendar GCO_STATA = new GregorianCalendar(TimeZone.getTimeZone("GMT")); - - private static String[] DATE_TIME_FORMAT= { - "%tc", "%td", "%tw", "%tq","%tm", "%th", "%ty", - "%d", "%w", "%q", "%m", "h", "%tb" - }; - // New "business calendar format" has been added in Stata 12. -- L.A. - private static String[] DATE_TIME_CATEGORY={ - "time", "date", "date", "date", "date", "date", "date", - "date", "date", "date", "date", "date", "date" - }; - private static Map DATE_TIME_FORMAT_TABLE= new LinkedHashMap(); - - private static long SECONDS_PER_YEAR = 24*60*60*1000L; // TODO: huh? - - private static long STATA_BIAS_TO_EPOCH; - - static { - - sdf_ymdhmsS.setTimeZone(TimeZone.getTimeZone("GMT")); - sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT")); - sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT")); - sdf_yw.setTimeZone(TimeZone.getTimeZone("GMT")); - - // set stata's calendar - GCO_STATA.set(1, 1960);// year - GCO_STATA.set(2, 0); // month - GCO_STATA.set(5, 1);// day of month - GCO_STATA.set(9, 0);// AM(0) or PM(1) - GCO_STATA.set(10, 0);// hh - GCO_STATA.set(12, 0);// mm - GCO_STATA.set(13, 0);// ss - GCO_STATA.set(14, 0); // SS millisecond - - - STATA_BIAS_TO_EPOCH = GCO_STATA.getTimeInMillis(); // = -315619200000 - - for (int i=0; i constantTable ; - - private Map byteLengthTable; - - private Map variableTypeTable; - - private Map cachedGSOs; - - - - private NumberFormat twoDigitFormatter = new DecimalFormat("00"); - - private NumberFormat doubleNumberFormatter = new DecimalFormat(); - - TabularDataIngest ingesteddata = new TabularDataIngest(); - - - private int releaseNumber = 117; - - private int headerLength; - - private int dataLabelLength; - - private boolean hasSTRLs = false; - - - // TODO: - // rewrite this comment? - /* variableTypes is a list of string values representing the type of - * data values *stored* in the file - "byte", "integer", "float", "string", - * etc. 
We need this information as we're reading the data, to know how - * many bytes to read for every object type and how to convert the binary - * data into the proper Java type. - * It's important to note that these types are *Stata* types - the types - * of the variables on the DVN side may change (see below). - * The variableTypesFinal will describe the data values once they have - * been read and stored in the tab. file. This is an important distinction: - * for example, the time/data values are stored as binary numeric values - * in Stata files, but we'll be storing them as strings in the DVN tabular - * files. - */ - - private String[] variableTypes=null; - - private String[] dateVariableFormats=null; - - private static final String MissingValueForTabDelimitedFile = ""; - - - // Constructor -----------------------------------------------------------// - - public DTA117FileReader(TabularDataFileReaderSpi originator){ - super(originator); - } - - - /* - * This method configures Stata's release-specific parameters: - */ - // TODO: this method needs to be actually called! - private void init() throws IOException { - // - logger.fine("release number=" + releaseNumber); - - variableTypeTable = variableTypeTable117; - - byteLengthTable = byteLengthTable117; - BYTE_MISSING_VALUE -= MISSING_VALUE_BIAS; - INT_MISSIG_VALUE -= MISSING_VALUE_BIAS; - LONG_MISSING_VALUE -= MISSING_VALUE_BIAS; - - constantTable = CONSTANT_TABLE.get(releaseNumber); - - headerLength = constantTable.get("HEADER") - DTA_MAGIC_NUMBER_LENGTH; - - dataLabelLength = headerLength - (NVAR_FIELD_LENGTH - + NOBS_FIELD_LENGTH + TIME_STAMP_LENGTH); - logger.fine("data_label_length=" + dataLabelLength); - - logger.fine("constant table to be used:\n" + constantTable); - - doubleNumberFormatter.setGroupingUsed(false); - doubleNumberFormatter.setMaximumFractionDigits(340); // TODO: WTF??? - - /* - * No longer necessary to use variable service bean during ingest! 
- * - - Context ctx = null; - - try { - ctx = new InitialContext(); - varService = (VariableServiceBean) ctx.lookup("java:global/dataverse-4.0/VariableServiceBean"); - } catch (NamingException nex) { - try { - ctx = new InitialContext(); - varService = (VariableServiceBean) ctx.lookup("java:global/dataverse/VariableServiceBean"); - } catch (NamingException nex2) { - logger.severe("Could not look up initial context, or the variable service in JNDI!"); - throw new IOException("Could not look up initial context, or the variable service in JNDI!"); - } - } - */ - } - - public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { - logger.info("DTA117FileReader: read() start"); - - // shit ton of diagnostics (still) needed here!! -- L.A. - - if (dataFile != null) { - throw new IOException ("this plugin does not support external raw data files"); - } - - DataReader dataReader = null; - - try { - init(); - - // create a new instance of DataReader: - dataReader = new DataReader(stream); - // and read the opening tag: - dataReader.readOpeningTag(TAG_DTA_117); - - // ...and if we've made this far, we can try - // and read the header section: - readHeader(dataReader); - - // then the map: - readMap(dataReader); - - // variable types: - readVariableTypes(dataReader); - - // variable names: - readVariableNames(dataReader); - - // sort order: - readSortOrder(dataReader); - - // display formats: - readDisplayFormats(dataReader); - - // value label formats: - readValueLabelFormatNames(dataReader); - - // variable labels: - readVariableLabels(dataReader); - - // "characteristics" - STATA-proprietary information - // (we are skipping it) - readCharacteristics(dataReader); - - // Data! - readData(dataReader); - - // STRLs: - // (potentially) large, (potentially) non-ASCII character strings - // saved outside the ... section, and referenced - // in the data rows using (v,o) notation - see the documentation - // for more information. 
- readSTRLs(dataReader); - - // finally, Value Labels: - readValueLabels(dataReader); - - // verify that we've reached the final closing tag: - dataReader.readClosingTag(TAG_DTA_117); - - ingesteddata.setDataTable(dataTable); - } catch (IllegalArgumentException iaex) { - throw new IOException(iaex.getMessage()); - } - - logger.info("DTA117FileReader: read() end."); - return ingesteddata; - } - - - - private void readHeader(DataReader dataReader) throws IOException { - logger.fine("readHeader(): start"); - - if (dataReader == null) { - throw new IllegalArgumentException("stream == null!"); - } - - logger.fine("reading the version header."); - - dataReader.readOpeningTag(TAG_HEADER); - String dtaVersionTag = dataReader.readPrimitiveStringSection(TAG_HEADER_FILEFORMATID, 3); - - if (!"117".equals(dtaVersionTag)) { - throw new IOException("Unexpected version tag found: "+dtaVersionTag+"; expected value: 117."); - } - - String byteOrderTag = dataReader.readPrimitiveStringSection(TAG_HEADER_BYTEORDER); - - logger.fine("byte order: "+byteOrderTag); - - if ("LSF".equals(byteOrderTag)) { - dataReader.setLSF(true); - } else if ("MSF".equals(byteOrderTag)) { - dataReader.setLSF(false); - } - - int varNumber = dataReader.readIntegerSection(TAG_HEADER_VARNUMBER, 2); - logger.fine("number of variables: " + varNumber); - - int obsNumber = dataReader.readIntegerSection(TAG_HEADER_OBSNUMBER, 4); - logger.fine("number of observations: " + obsNumber); - - dataTable.setVarQuantity(new Long(varNumber)); - dataTable.setCaseQuantity(new Long(obsNumber)); - - - dataTable.setOriginalFileFormat(MIME_TYPE[0]); - dataTable.setOriginalFormatVersion("STATA 13"); - dataTable.setUnf("UNF:pending"); - - // The word "dataset" below is used in its STATA parlance meaning, - // i.e., this is a label that describes the datafile. 
- String datasetLabel = dataReader.readDefinedStringSection(TAG_HEADER_FILELABEL, 80); - logger.fine("dataset label: "+datasetLabel); - - // TODO: - // do we want to do anything with this label? Add it to the - // filemetadata, similarly to what we do with those auto-generated - // FITS descriptive labels maybe? - // (similarly, what to do with the date stamp, below?) - // -- L.A. 4.0 beta 8 - - String datasetTimeStamp = dataReader.readDefinedStringSection(TAG_HEADER_TIMESTAMP, 17); - logger.fine("dataset time stamp: "+datasetTimeStamp); - - if (datasetTimeStamp == null || - (datasetTimeStamp.length() > 0 && datasetTimeStamp.length() < 17)) { - throw new IOException("unexpected/invalid length of the time stamp in the DTA117 header."); - } else { - // TODO: validate the time stamp found against dd Mon yyyy hh:mm; - // ...but first decide if we actually want/need to use it for any - // practical purposes... - } - - dataReader.readClosingTag("header"); - logger.fine("readHeader(): end"); - } - - /* - TODO: add a comment. --L.A. 
DVN 4.0 beta 8 - */ - private void readMap(DataReader reader) throws IOException { - logger.fine("Map section; at offset "+reader.getByteOffset()); - reader.readOpeningTag(TAG_MAP); - - dtaMap = new DTADataMap(); - - long dta_offset_stata_data = reader.readLongInteger(); - logger.fine("dta_offset_stata_data: " + dta_offset_stata_data); - dtaMap.setOffset_head(dta_offset_stata_data); - long dta_offset_map = reader.readLongInteger(); - logger.fine("dta_offset_map: " + dta_offset_map); - dtaMap.setOffset_map(dta_offset_map); - long dta_offset_variable_types = reader.readLongInteger(); - logger.fine("dta_offset_variable_types: " + dta_offset_variable_types); - dtaMap.setOffset_types(dta_offset_variable_types); - long dta_offset_varnames = reader.readLongInteger(); - logger.fine("dta_offset_varnames: " + dta_offset_varnames); - dtaMap.setOffset_varnames(dta_offset_varnames); - long dta_offset_sortlist = reader.readLongInteger(); - logger.fine("dta_offset_sortlist: " + dta_offset_sortlist); - dtaMap.setOffset_srtlist(dta_offset_sortlist); - long dta_offset_formats = reader.readLongInteger(); - logger.fine("dta_offset_formats: " + dta_offset_formats); - dtaMap.setOffset_fmts(dta_offset_formats); - long dta_offset_value_label_names = reader.readLongInteger(); - logger.fine("dta_offset_value_label_names: " + dta_offset_value_label_names); - dtaMap.setOffset_vlblnames(dta_offset_value_label_names); - long dta_offset_variable_labels = reader.readLongInteger(); - logger.fine("dta_offset_variable_labels: " + dta_offset_variable_labels); - dtaMap.setOffset_varlabs(dta_offset_variable_labels); - long dta_offset_characteristics = reader.readLongInteger(); - logger.fine("dta_offset_characteristics: " + dta_offset_characteristics); - dtaMap.setOffset_characteristics(dta_offset_characteristics); - long dta_offset_data = reader.readLongInteger(); - logger.fine("dta_offset_data: " + dta_offset_data); - dtaMap.setOffset_data(dta_offset_data); - long dta_offset_strls = 
reader.readLongInteger(); - logger.fine("dta_offset_strls: " + dta_offset_strls); - dtaMap.setOffset_strls(dta_offset_strls); - long dta_offset_value_labels = reader.readLongInteger(); - logger.fine("dta_offset_value_labels: " + dta_offset_value_labels); - dtaMap.setOffset_vallabs(dta_offset_value_labels); - long dta_offset_data_close = reader.readLongInteger(); - logger.fine("dta_offset_data_close: " + dta_offset_data_close); - dtaMap.setOffset_data_close(dta_offset_data_close); - long dta_offset_eof = reader.readLongInteger(); - logger.fine("dta_offset_eof: " + dta_offset_eof); - dtaMap.setOffset_eof(dta_offset_eof); - - reader.readClosingTag(TAG_MAP); - - } - - /* - * Variable type information is stored in the ... - * section, as number_of_variables * 2 byte values. - * the type codes are defined as follows: - * (TODO: ...) - */ - - private void readVariableTypes(DataReader reader) throws IOException { - // TODO: - // check that we are at the right byte offset! - logger.fine("Type section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_types()); - reader.readOpeningTag(TAG_VARIABLE_TYPES); - - List variableList = new ArrayList(); - // setup variableTypeList - variableTypes = new String[dataTable.getVarQuantity().intValue()]; - - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - int type = reader.readShortInteger(); - logger.fine("variable "+i+": type="+type); - DataVariable dv = new DataVariable(); - - dv.setInvalidRanges(new ArrayList()); - dv.setSummaryStatistics( new ArrayList()); - dv.setCategories(new ArrayList()); - - dv.setUnf("UNF:pending"); - dv.setFileOrder(i); - dv.setDataTable(dataTable); - - variableTypes[i] = configureVariableType(dv, type); - // TODO: - // we could also calculate the byte offset table now, rather - // then figure it out later... - ? 
- - variableList.add(dv); - - } - - - reader.readClosingTag(TAG_VARIABLE_TYPES); - dataTable.setDataVariables(variableList); - - } - - // TODO: - // calculate bytes_per_row while we are here -- ? - private String configureVariableType(DataVariable dv, int type) throws IOException { - String typeLabel = null; - - if (variableTypeTable.containsKey(type)) { - typeLabel = variableTypeTable.get(type); - - dv.setTypeNumeric(); - if (typeLabel.equals("Byte") || typeLabel.equals("Integer") || typeLabel.equals("Long")) { - // these are treated as discrete: - dv.setIntervalDiscrete(); - - } else if (typeLabel.equals("Float") || typeLabel.equals("Double")) { - // these are treated as contiuous: - dv.setIntervalContinuous(); - - } else { - throw new IOException("Unrecognized type label: " + typeLabel + " for Stata type value (short) " + type + "."); - } - - } else { - // String: - // - // 32768 - flexible length STRL; - // 1 ... 2045 - fixed-length STRF; - - if (type == 32768) { - typeLabel = "STRL"; - hasSTRLs = true; - - } else if (type > 0 && type < 2046) { - typeLabel = "STR" + type; - } else { - throw new IOException("unknown variable type value encountered: " + type); - } - - dv.setTypeCharacter(); - dv.setIntervalDiscrete(); - } - - return typeLabel; - - } - - /* - * Variable Names are stored as number_of_variables * 33 byte long - * (zero-padded and zero-terminated) character vectors. - */ - private void readVariableNames(DataReader reader) throws IOException { - logger.fine("Variable names section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_varnames()); - // TODO: - // check that we are at the right byte offset! 
- reader.readOpeningTag(TAG_VARIABLE_NAMES); - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - String variableName = reader.readString(33); - logger.fine("variable "+i+": name=" + variableName); - if ((variableName != null) && (!variableName.equals(""))) { - dataTable.getDataVariables().get(i).setName(variableName); - } else { - // TODO: decide if we should throw an exception here. - } - } - - reader.readClosingTag(TAG_VARIABLE_NAMES); - } - - /* - * TODO: add a comment - */ - - private void readSortOrder(DataReader reader) throws IOException { - logger.fine("Sort Order section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_srtlist()); - // TODO: - // check that we are at the right byte offset! - reader.readOpeningTag(TAG_SORT_ORDER); - - // TODO: initialize DataVariable objects here - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - int order = reader.readShortInteger(); - logger.fine("variable "+i+": sort order="+order); - // TODO: - // Double-check that we don't really need this sort order - // for any practical purposes. - // -- L.A. 4.0 beta 8 - } - - // Important! - // The SORT ORDER section (5.5 in the doc) always contains - // number_of_variables + 1 short (2-byte) integers! - - int terminatingShort = reader.readShortInteger(); - - reader.readClosingTag(TAG_SORT_ORDER); - } - - /* - * TODO: add a comment - */ - /* Variable Formats are used exclusively for time and date variables. - * (TODO: but should we be using the decimal formats and such too? -- 4.0 beta 8) - * -- L.A. 4.0 - */ - - private void readDisplayFormats(DataReader reader) throws IOException { - logger.fine("Formats section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_fmts()); - // TODO: - // check that we are at the right byte offset! 
- reader.readOpeningTag(TAG_DISPLAY_FORMATS); - dateVariableFormats = new String[dataTable.getVarQuantity().intValue()]; - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - String variableFormat = reader.readString(49); - logger.fine("variable "+i+": displayFormat=" + variableFormat); - // TODO: - // Decide what we are doing with these. - // (saving them, for archival purposes?) - - // this is from the old plugin: - // TODO: review! - - String variableFormatKey = null; - if (variableFormat.startsWith("%t")) { - variableFormatKey = variableFormat.substring(0, 3); - } else { - variableFormatKey = variableFormat.substring(0, 2); - } - logger.fine(i + " th variableFormatKey=" + variableFormatKey); - - /* - * Now, let's check if this format is a known time or date format. - * If so, note that this changes the storage type of the variable! - * i.e., times and dates are stored as binary numeric values, but on - * the DVN side/in the tab files they will become strings. - * TODO: it kinda does look like we can get rid of the variableFormats[] - * list; these formats are only used if this is a recognized - * "date/time datum" (see below); so then it looks like we can - * extract this info from the DataVariable "formatschemaname". - * -- L.A. 
4.0 - */ - if (DATE_TIME_FORMAT_TABLE.containsKey(variableFormatKey)) { - dateVariableFormats[i] = variableFormat; - dataTable.getDataVariables().get(i).setFormatCategory(DATE_TIME_FORMAT_TABLE.get(variableFormatKey)); - logger.fine(i + "th var: category=" + - DATE_TIME_FORMAT_TABLE.get(variableFormatKey)); - dataTable.getDataVariables().get(i).setTypeCharacter(); - dataTable.getDataVariables().get(i).setIntervalDiscrete(); - } - } - - reader.readClosingTag(TAG_DISPLAY_FORMATS); - } - - /* - * Another fixed-field section - */ - private void readValueLabelFormatNames(DataReader reader) throws IOException { - logger.fine("Category valuable section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_vlblnames()); - // TODO: - // check that we are at the right byte offset! - reader.readOpeningTag(TAG_VALUE_LABEL_FORMAT_NAMES); - - valueLabelsLookupTable = new String[dataTable.getVarQuantity().intValue()]; - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - // TODO: - // Define all the byte lengths as constants! - String valueLabelFormat = reader.readString(33); - logger.fine("variable "+i+": value label format=" + valueLabelFormat); - if ((valueLabelFormat != null) && (!valueLabelFormat.equals(""))) { - valueLabelsLookupTable[i] = valueLabelFormat; - } - } - - reader.readClosingTag(TAG_VALUE_LABEL_FORMAT_NAMES); - - } - - /* - * Another fixed-field section - */ - private void readVariableLabels(DataReader reader) throws IOException { - logger.fine("Variable labels section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_varlabs()); - // TODO: - // check that we are at the right byte offset! 
- reader.readOpeningTag(TAG_VARIABLE_LABELS); - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - String variableLabel = reader.readString(81); - logger.fine("variable "+i+": label=" + variableLabel); - if ((variableLabel != null) && (!variableLabel.equals(""))) { - dataTable.getDataVariables().get(i).setLabel(variableLabel); - } - } - - reader.readClosingTag(TAG_VARIABLE_LABELS); - } - - /* - * TODO: add a comment - */ - private void readCharacteristics(DataReader reader) throws IOException { - logger.fine("Characteristics section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_characteristics()); - // TODO: - // check that we are at the right byte offset! - reader.readOpeningTag(TAG_CHARACTERISTICS); - - reader.skipDefinedSections(TAG_CHARACTERISTICS_SUBSECTION); - - reader.readClosingTag(TAG_CHARACTERISTICS); - - } - - /* - * TODO: add comments. - */ - private void readData(DataReader reader) throws IOException { - logger.fine("Data section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_data()); - logger.fine("readData(): start"); - reader.readOpeningTag(TAG_DATA); - // TODO: - // check that we are at the right byte offset! 
- - int nvar = dataTable.getVarQuantity().intValue(); - int nobs = dataTable.getCaseQuantity().intValue(); - - int[] variableByteLengths = getVariableByteLengths(variableTypes); - int bytes_per_row = calculateBytesPerRow(variableByteLengths); - - logger.fine("data dimensions[observations x variables] = (" + nobs + "x" + nvar + ")"); - logger.fine("bytes per row=" + bytes_per_row + " bytes"); - logger.fine("variableTypes=" + Arrays.deepToString(variableTypes)); - - // create a File object to save the tab-delimited data file - FileOutputStream fileOutTab = null; - PrintWriter pwout = null; - File tabDelimitedDataFile = File.createTempFile("tempTabfile.", ".tab"); - - // save the temp tab-delimited file in the return ingest object: - ingesteddata.setTabDelimitedFile(tabDelimitedDataFile); - - fileOutTab = new FileOutputStream(tabDelimitedDataFile); - pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); - - logger.fine("Beginning to read data stream."); - - for (int i = 0; i < nobs; i++) { - //byte[] dataRowBytes = new byte[bytes_per_row]; - Object[] dataRow = new Object[nvar]; - - //int nbytes = stream.read(dataRowBytes, 0, bytes_per_row); - //dataRowBytes = reader.readBytes(bytes_per_row); - // TODO: - // maybe intercept any potential exceptions here, and add more - // diagnostic info, before re-throwing... - int byte_offset = 0; - for (int columnCounter = 0; columnCounter < nvar; columnCounter++) { - - String varType = variableTypes[columnCounter]; - - // 4.0 Check if this is a time/date variable: - boolean isDateTimeDatum = false; - // TODO: - // make sure the formats are properly set! -- use the old - // plugin as a model... 
- String formatCategory = dataTable.getDataVariables().get(columnCounter).getFormatCategory(); - if (formatCategory != null && (formatCategory.equals("time") || formatCategory.equals("date"))) { - isDateTimeDatum = true; - } - - // TODO: - // ditto - String variableFormat = dateVariableFormats[columnCounter]; - - if (varType == null || varType.equals("")) { - throw new IOException("Undefined variable type encountered in readData()"); - } - - // TODO: - // double-check that the missing values constants are still correct! - if (varType.equals("Byte")) { - // (signed) Byte - byte byte_datum = reader.readSignedByte(); - - logger.fine(i + "-th row " + columnCounter - + "=th column byte =" + byte_datum); - if (byte_datum >= BYTE_MISSING_VALUE) { - logger.fine(i + "-th row " + columnCounter - + "=th column byte MV=" + byte_datum); - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - } else { - dataRow[columnCounter] = byte_datum; - logger.fine(i + "-th row " + columnCounter + - "-th column byte value=" + byte_datum); - } - - byte_offset++; - } else if (varType.equals("Integer")) { - short short_datum = (short) reader.readShortSignedInteger(); - - logger.fine(i + "-th row " + columnCounter - + "=th column stata int =" + short_datum); - - if (short_datum >= INT_MISSIG_VALUE) { - logger.fine(i + "-th row " + columnCounter - + "=th column stata long missing value=" + short_datum); - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - } else { - - if (isDateTimeDatum) { - - DecodedDateTime ddt = decodeDateTimeData("short", variableFormat, Short.toString(short_datum)); - logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); - dataRow[columnCounter] = ddt.decodedDateTime; - //dateFormat[columnCounter][i] = ddt.format; - dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); - - } else { - dataRow[columnCounter] = short_datum; - logger.fine(i + "-th row " + columnCounter + - "-th column \"integer\" value=" 
+ short_datum); - } - } - byte_offset += 2; - } else if (varType.equals("Long")) { - // stata-Long (= java's int: 4 byte), signed. - - int int_datum = reader.readSignedInteger(); - - if (int_datum >= LONG_MISSING_VALUE) { - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - } else { - if (isDateTimeDatum) { - DecodedDateTime ddt = decodeDateTimeData("int", variableFormat, Integer.toString(int_datum)); - logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); - dataRow[columnCounter] = ddt.decodedDateTime; - dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); - - } else { - dataRow[columnCounter] = int_datum; - logger.fine(i + "-th row " + columnCounter + - "-th column \"long\" value=" + int_datum); - } - - } - byte_offset += 4; - } else if (varType.equals("Float")) { - // STATA float - // same as Java float - 4-byte - - float float_datum = reader.readFloat(); - - logger.fine(i + "-th row " + columnCounter - + "=th column float =" + float_datum); - if (FLOAT_MISSING_VALUE_SET.contains(float_datum)) { - logger.fine(i + "-th row " + columnCounter - + "=th column float missing value=" + float_datum); - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - - } else { - - if (isDateTimeDatum) { - DecodedDateTime ddt = decodeDateTimeData("float", variableFormat, doubleNumberFormatter.format(float_datum)); - logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); - dataRow[columnCounter] = ddt.decodedDateTime; - dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); - } else { - dataRow[columnCounter] = float_datum; - logger.fine(i + "-th row " + columnCounter - + "=th column float value:" + float_datum); - // This may be temporary - but for now (as in, while I'm testing - // 4.0 ingest against 3.* ingest, I need to be able to tell if a - // floating point value was a single, or double float in the - // original STATA file: -- L.A. Jul. 
2014 - dataTable.getDataVariables().get(columnCounter).setFormat("float"); - // ? - } - - } - byte_offset += 4; - } else if (varType.equals("Double")) { - // STATA double - // same as Java double - 8-byte - - double double_datum = reader.readDouble(); - - if (DOUBLE_MISSING_VALUE_SET.contains(double_datum)) { - logger.finer(i + "-th row " + columnCounter - + "=th column double missing value=" + double_datum); - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - } else { - - if (isDateTimeDatum) { - DecodedDateTime ddt = decodeDateTimeData("double", variableFormat, doubleNumberFormatter.format(double_datum)); - logger.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); - dataRow[columnCounter] = ddt.decodedDateTime; - dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); - } else { - logger.fine(i + "-th row " + columnCounter - + "=th column double value:" + double_datum); //doubleNumberFormatter.format(double_datum)); - - dataRow[columnCounter] = double_datum; //doubleNumberFormatter.format(double_datum); - } - - } - byte_offset += 8; - } else if (varType.matches("^STR[1-9][0-9]*")) { - // String case - int strVarLength = variableByteLengths[columnCounter]; - logger.fine(i + "-th row " + columnCounter - + "=th column is a string (" + strVarLength + " bytes)"); - //String raw_datum = new String(Arrays.copyOfRange(dataRowBytes, byte_offset, - // (byte_offset + strVarLength)), "ISO-8859-1"); - // (old) TODO: - // is it the right thing to do, to default to "ISO-8859-1"? - // (it may be; since there's no mechanism for specifying - // alternative encodings in Stata, this may be their default; - // it just needs to be verified. -- L.A. Jul. 2014) - // ACTUALLY, in STATA13, it appears that STRF *MUST* - // be limited to ASCII. Binary strings can be stored as - // STRLs. (Oct. 
6 2014) - - //String string_datum = getNullStrippedString(raw_datum); - String string_datum = reader.readString(strVarLength); - if (string_datum.length() < 64) { - logger.fine(i + "-th row " + columnCounter - + "=th column string =" + string_datum); - } else { - logger.fine(i + "-th row " + columnCounter - + "=th column string =" + string_datum.substring(0, 64) + "... (truncated)"); - } - if (string_datum.equals("")) { - - logger.fine(i + "-th row " + columnCounter - + "=th column string missing value=" + string_datum); - - // TODO: - /* Is this really a missing value case? - * Or is it an honest empty string? - * Is there such a thing as a missing value for a String in Stata? - * -- L.A. 4.0 - */ - dataRow[columnCounter] = MissingValueForTabDelimitedFile; - } else { - /* - * Some special characters, like new lines and tabs need to - * be escaped - otherwise they will break our TAB file - * structure! - */ - - dataRow[columnCounter] = escapeCharacterString(string_datum); - } - byte_offset += strVarLength; - } else if (varType.equals("STRL")) { - //throw new IOException(""); - logger.fine("STRL encountered."); - - if (cachedGSOs == null) { - cachedGSOs = new LinkedHashMap<>(); - } - - // Reading the (v,o) pair: - long v = 0; - long o = 0; - String voPair = null; - // first v: - - v = reader.readInteger(); - byte_offset += 4; - - // then o: - - o = reader.readInteger(); - byte_offset += 4; - - // create v,o pair; save, for now: - - voPair = v + "," + o; - dataRow[columnCounter] = voPair; - - // TODO: - // Validate v and o? - // Making sure v <= varNum and o < numbObs; - // or, if o == numObs, v <= columnCounter; - // -- per the Stata 13 spec... - - if (!(v == columnCounter + 1 && o == i + 1)) { - if (!cachedGSOs.containsKey(voPair)) { - cachedGSOs.put(voPair, ""); - // this means we need to cache this GSO, when - // we read the STRLS section later on. 
- } - } - - } else { - logger.warning("unknown variable type found: " + varType); - String errorMessage - = "unknown variable type encounted when reading data section: " + varType; - //throw new InvalidObjectException(errorMessage); - throw new IOException(errorMessage); - - } - } // for (columnCounter) - - if (byte_offset != bytes_per_row) { - throw new IOException("Unexpected number of bytes read for data row " + i + "; " + bytes_per_row + " expected, " + byte_offset + " read."); - } - - // Dump the row of data to the tab-delimited file: - pwout.println(StringUtils.join(dataRow, "\t")); - - logger.fine("finished reading "+i+"-th row"); - - - } // for (rows) - - pwout.close(); - - reader.readClosingTag(TAG_DATA); - logger.fine("DTA117 Ingest: readData(): end."); - - } - - /* - * STRLs: - * (simply skipping these, for now) - */ - - private void readSTRLs(DataReader reader) throws IOException { - logger.fine("STRLs section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_strls()); - // TODO: - // check that we are at the right byte offset! 
- //reader.readOpeningTag(TAG_STRLS); - - if (hasSTRLs) { - reader.readOpeningTag(TAG_STRLS); - - File intermediateTabFile = ingesteddata.getTabDelimitedFile(); - FileInputStream fileInTab = new FileInputStream(intermediateTabFile); - - Scanner scanner = new Scanner(fileInTab); - scanner.useDelimiter("\\n"); - - File finalTabFile = File.createTempFile("finalTabfile.", ".tab"); - FileOutputStream fileOutTab = new FileOutputStream(finalTabFile); - PrintWriter pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); - - logger.fine("Setting the tab-delimited file to "+finalTabFile.getName()); - ingesteddata.setTabDelimitedFile(finalTabFile); - - int nvar = dataTable.getVarQuantity().intValue(); - int nobs = dataTable.getCaseQuantity().intValue(); - - String[] line; - - for (int obsindex = 0; obsindex < nobs; obsindex++) { - if (scanner.hasNext()) { - line = (scanner.next()).split("\t", -1); - - for (int varindex = 0; varindex < nvar; varindex++) { - if ("STRL".equals(variableTypes[varindex])) { - // this is a STRL; needs to be re-processed: - - String voPair = line[varindex]; - long v; - long o; - if (voPair == null) { - throw new IOException("Failed to read an intermediate v,o Pair for variable "+ - varindex + ", observation "+obsindex); - } - - if ("0,0".equals(voPair)) { - // This is a code for an empty string - ""; - // doesn't need to be defined or looked up. 
- - line[varindex] = "\"\""; - } else { - String[] voTokens = voPair.split(",", 2); - - try { - v = new Long(voTokens[0]).longValue(); - o = new Long(voTokens[1]).longValue(); - } catch (NumberFormatException nfex) { - throw new IOException("Illegal v,o value: "+voPair+" for variable "+ - varindex + ", observation "+obsindex); - } - - if (v == varindex + 1 && o == obsindex + 1) { - // This v,o must be defined in the STRLs section: - line[varindex] = readGSO(reader, v, o); - if (line[varindex] == null) { - throw new IOException ("Failed to read GSO value for "+voPair); - } - - } else { - // This one must have been cached already: - if (cachedGSOs.get(voPair) != null && - !cachedGSOs.get(voPair).equals("")) { - line[varindex] = cachedGSOs.get(voPair); - } else { - throw new IOException("GSO string unavailable for v,o value "+voPair); - } - } - } - } - } - // Dump the row of data to the tab-delimited file: - pwout.println(StringUtils.join(line, "\t")); - } - } - - scanner.close(); - pwout.close(); - - reader.readClosingTag(TAG_STRLS); - } else { - // If this data file doesn't use STRLs, we can just skip - // this section, and assume that we are done with the - // tabular data file. 
- reader.readPrimitiveSection(TAG_STRLS); - } - - //reader.readClosingTag(TAG_STRLS); - } - - private String readGSO(DataReader reader, long v, long o) throws IOException { - if (!reader.checkTag(STRL_GSO_HEAD)) { - return null; - } - - // Skipping the GSO header - fixed string "GSO": - reader.readBytes(STRL_GSO_HEAD.length()); - - // Reading the stored (v,o) pair: - - long vStored = reader.readInteger(); - long oStored = reader.readInteger(); - - String voPair = v + "," + o; - - - if (vStored != v || oStored != o) { - throw new IOException ("GSO reading mismatch: expected v,o pair: "+ - voPair+", found: "+vStored+","+oStored); - } - - short type = reader.readByte(); - boolean binary = false; - - if (type == 129) { - logger.fine("STRL TYPE: binary"); - binary = true; - } else if (type == 130) { - logger.fine("STRL TYPE: ascii"); - } else { - logger.warning("WARNING: unknown STRL type: "+type); - } - - long length = reader.readInteger(); - - logger.fine("Advertised length of the STRL: "+length); - - // TODO: - // length can technically be 0 < length < 2^^32; - // but Java arrays are only [int], i.e., can only have < 2^^31 - // elements; readBytes() allocates and returns a byte[] array. - // so I should probably check the value of length - if it - // can fit into a signed int; not that it's likely to happen - // in real life. -- L.A. 4.0 beta 11 - - byte[] contents = reader.readBytes((int)length); - - // TODO: - // Depending on whether this GSO is advertised as ASCII or binary, - // I should probably create the String as either ASCII or UTF8. - // (and the trailing zero needs to be chopped, if it's the ASCII kind) - // -- L.A. 4.0 beta 11 - - String gsoString = null; - - if (binary) { - gsoString = new String(contents, "utf8"); // ? 
- } else { - gsoString = new String(contents, 0, (int)length-1, "US-ASCII"); - } - - logger.fine("GSO "+v+","+o+": "+gsoString); - - String escapedGsoString = escapeCharacterString(gsoString); - - if (cachedGSOs.containsKey(voPair)) { - // We need to cache this GSO: - if (!"".equals(cachedGSOs.get(voPair))) { - throw new IOException ("Multiple GSO definitions for v,o "+voPair); - } - cachedGSOs.put(voPair, escapedGsoString); - } - - return escapedGsoString; - } - - private void readValueLabels(DataReader reader) throws IOException { - logger.fine("Value Labels section; at offset "+reader.getByteOffset()+"; dta map offset: "+dtaMap.getOffset_vallabs()); - logger.fine("readValueLabels(): start."); - - // TODO: - // check that we are at the right byte offset! - reader.readOpeningTag(TAG_VALUE_LABELS); - - while (reader.checkTag("<" + TAG_VALUE_LABELS_LBL_DEF + ">")) { - // TODO: checktag should probably *read* the tag, if it is indeed - // encountered, rather then stop at the beginning of the tag. - reader.readOpeningTag(TAG_VALUE_LABELS_LBL_DEF); - long label_table_length = reader.readInteger(); - // TODO: - // think of better variable names... - - String label_table_name = reader.readString(33); - // TODO: - // do we need to worry about uniquness? or has Stata already - // guaranteed that there are no other category value table - // defined under this name? - reader.readBytes(3); // TODO: skipBytes() instead - - long value_category_offset = 0; - - // read the value_label_table that follows. - // should be label_table_length. 
- int number_of_categories = (int)reader.readInteger(); - long text_length = reader.readInteger(); - - value_category_offset = 8; - - long[] value_label_offsets = new long[number_of_categories]; - long[] category_values = new long[number_of_categories]; - String[] category_value_labels = new String[number_of_categories]; - - for (int i = 0; i < number_of_categories; i++) { - value_label_offsets[i] = reader.readInteger(); - value_category_offset += 4; - } - - for (int i = 0; i < number_of_categories; i++) { - // TODO: - // can the category values be negative? - category_values[i] = reader.readInteger(); - value_category_offset += 4; - } - - int total_label_bytes = 0; - - long label_offset = 0; - long label_end = 0; - int label_length = 0; - - for (int i = 0; i < number_of_categories; i++) { - label_offset = value_label_offsets[i]; - label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length; - label_length = (int)(label_end - label_offset); - - category_value_labels[i] = reader.readString(label_length); - total_label_bytes += label_length; - } - - value_category_offset += total_label_bytes; - - if (total_label_bytes != text_length) { - throw new IOException(""); - } - - if (value_category_offset != label_table_length) { - throw new IOException(""); - } - reader.readClosingTag(TAG_VALUE_LABELS_LBL_DEF); - - // Find the variables that link to this Category Values Table - // and create VariableCategory objects for the corresponding - // DataVariables: - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - if (label_table_name.equals(valueLabelsLookupTable[i])) { - logger.fine("cross-linking value label table for "+label_table_name); - // it is actually a legit condition - when - // a variable is advertised as linked to a category values - // table of a certain name, but no such table exists. - // -- L.A. 
- for (int j = 0; j < number_of_categories; j++) { - VariableCategory cat = new VariableCategory(); - - long cat_value = category_values[j]; - String cat_label = category_value_labels[j]; - - cat.setValue(""+cat_value); - cat.setLabel(cat_label); - - /* cross-link the variable and category to each other: */ - cat.setDataVariable(dataTable.getDataVariables().get(i)); - dataTable.getDataVariables().get(i).getCategories().add(cat); - } - } - } - } - - reader.readClosingTag(TAG_VALUE_LABELS); - logger.fine("readValueLabels(): end."); - - } - - /* - * Helper methods for decoding data: - */ - - private int calculateBytesPerRow(int[] variableByteLengths) throws IOException { - if (variableByteLengths == null || variableByteLengths.length != dataTable.getVarQuantity()) { - throw new IOException(""); - } - int bytes_per_row = 0; - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - if (variableByteLengths[i] < 1) { - throw new IOException(""); - } - bytes_per_row += variableByteLengths[i]; - } - - return bytes_per_row; - } - - private int[] getVariableByteLengths(String[] variableTypes) throws IOException { - if (variableTypes == null || variableTypes.length != dataTable.getVarQuantity()) { - throw new IOException(""); - } - - int[] variableByteLengths = new int[dataTable.getVarQuantity().intValue()]; - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - variableByteLengths[i] = getVariableByteLength(variableTypes[i]); - } - - return variableByteLengths; - } - - private int getVariableByteLength(String variableType) throws IOException { - int byte_length = 0; - - if (variableType == null || variableType.equals("")) { - throw new IOException(""); - } - if (byteLengthTable.containsKey(variableType)) { - return byteLengthTable.get(variableType); - } - - if (variableType.matches("^STR[1-9][0-9]*")) { - String stringLengthToken = variableType.substring(3); - Integer stringLength = null; - try { - stringLength = new Integer(stringLengthToken); - } catch 
(NumberFormatException nfe) { - stringLength = null; - } - if (stringLength == null || stringLength.intValue() < 1 || stringLength.intValue() > 2045) { - throw new IOException("Invalid STRF encountered: " + variableType); - } - return stringLength.intValue(); - } - - throw new IOException ("Unknown/invalid variable type: "+variableType); - } - - private class DecodedDateTime { - String format; - String decodedDateTime; - } - - private DecodedDateTime decodeDateTimeData(String storageType, String FormatType, String rawDatum) throws IOException { - - logger.fine("(storageType, FormatType, rawDatum)=(" - + storageType + ", " + FormatType + ", " + rawDatum + ")"); - /* - * Historical note: - pseudofunctions, td(), tw(), tm(), tq(), and th() - used to be called d(), w(), m(), q(), and h(). - Those names still work but are considered anachronisms. - - */ - - long milliSeconds; - String decodedDateTime=null; - String format = null; - - if (FormatType.matches("^%tc.*")){ - // tc is a relatively new format - // datum is millisecond-wise - - milliSeconds = Long.parseLong(rawDatum)+ STATA_BIAS_TO_EPOCH; - decodedDateTime = sdf_ymdhmsS.format(new Date(milliSeconds)); - format = sdf_ymdhmsS.toPattern(); - logger.fine("tc: result="+decodedDateTime+", format = "+format); - - } else if (FormatType.matches("^%t?d.*")){ - milliSeconds = Long.parseLong(rawDatum)*SECONDS_PER_YEAR + STATA_BIAS_TO_EPOCH; - logger.fine("milliSeconds="+milliSeconds); - - decodedDateTime = sdf_ymd.format(new Date(milliSeconds)); - format = sdf_ymd.toPattern(); - logger.fine("td:"+decodedDateTime+", format = "+format); - - } else if (FormatType.matches("^%t?w.*")){ - - long weekYears = Long.parseLong(rawDatum); - long left = Math.abs(weekYears)%52L; - long years; - if (weekYears < 0L){ - left = 52L - left; - if (left == 52L){ - left = 0L; - } - //out.println("left="+left); - years = (Math.abs(weekYears) -1)/52L +1L; - years *= -1L; - } else { - years = weekYears/52L; - } - - String yearString = 
Long.valueOf(1960L + years).toString(); - String dayInYearString = new DecimalFormat("000").format((left*7) + 1).toString(); - String yearDayInYearString = yearString + "-" + dayInYearString; - - Date tempDate = null; - try { - tempDate = new SimpleDateFormat("yyyy-DDD").parse(yearDayInYearString); - } catch (ParseException ex) { - throw new IOException(ex); - } - - decodedDateTime = sdf_ymd.format(tempDate.getTime()); - format = sdf_ymd.toPattern(); - - } else if (FormatType.matches("^%t?m.*")){ - // month - long monthYears = Long.parseLong(rawDatum); - long left = Math.abs(monthYears)%12L; - long years; - if (monthYears < 0L){ - left = 12L - left; - //out.println("left="+left); - years = (Math.abs(monthYears) -1)/12L +1L; - years *= -1L; - } else { - years = monthYears/12L; - } - - String month = null; - if (left == 12L){ - left = 0L; - } - Long monthdata = (left+1); - month = "-"+twoDigitFormatter.format(monthdata).toString()+"-01"; - long year = 1960L + years; - String monthYear = Long.valueOf(year).toString() + month; - logger.fine("rawDatum="+rawDatum+": monthYear="+monthYear); - - decodedDateTime = monthYear; - format = "yyyy-MM-dd"; - logger.fine("tm:"+decodedDateTime+", format:"+format); - - } else if (FormatType.matches("^%t?q.*")){ - // quater - long quaterYears = Long.parseLong(rawDatum); - long left = Math.abs(quaterYears)%4L; - long years; - if (quaterYears < 0L){ - left = 4L - left; - //out.println("left="+left); - years = (Math.abs(quaterYears) -1)/4L +1L; - years *= -1L; - } else { - years = quaterYears/4L; - } - - String quater = null; - - if ((left == 0L) || (left == 4L)){ - //quater ="q1"; // - quater = "-01-01"; - } else if (left ==1L) { - //quater = "q2"; // - quater = "-04-01"; - } else if (left ==2L) { - //quater = "q3"; // - quater = "-07-01"; - } else if (left ==3L) { - //quater = "q4"; // - quater = "-11-01"; - } - - long year = 1960L + years; - String quaterYear = Long.valueOf(year).toString() + quater; - 
logger.fine("rawDatum="+rawDatum+": quaterYear="+quaterYear); - - decodedDateTime = quaterYear; - format = "yyyy-MM-dd"; - logger.fine("tq:"+decodedDateTime+", format:"+format); - - } else if (FormatType.matches("^%t?h.*")){ - // half year - // odd number:2nd half - // even number: 1st half - - long halvesYears = Long.parseLong(rawDatum); - long left = Math.abs(halvesYears)%2L; - long years; - if (halvesYears < 0L){ - years = (Math.abs(halvesYears) -1)/2L +1L; - years *= -1L; - } else { - years = halvesYears/2L; - } - - String half = null; - if (left != 0L){ - // odd number => 2nd half: "h2" - //half ="h2"; // - half = "-07-01"; - } else { - // even number => 1st half: "h1" - //half = "h1"; // - half = "-01-01"; - } - long year = 1960L + years; - String halfYear = Long.valueOf(year).toString() + half; - logger.fine("rawDatum="+rawDatum+": halfYear="+halfYear); - - decodedDateTime = halfYear; - format = "yyyy-MM-dd"; - logger.fine("th:"+decodedDateTime+", format:"+format); - - } else if (FormatType.matches("^%t?y.*")){ - // year type's origin is 0 AD - decodedDateTime = rawDatum; - format = "yyyy"; - logger.fine("th:"+decodedDateTime); - } else { - decodedDateTime = rawDatum; - format=null; - } - DecodedDateTime retValue = new DecodedDateTime(); - retValue.decodedDateTime = decodedDateTime; - retValue.format = format; - return retValue; - } - - private class DataReader { - private BufferedInputStream stream; - private int DEFAULT_BUFFER_SIZE = 8192;// * 2; - private byte[] byte_buffer; - private int buffer_size; - private long byte_offset; - private int buffer_byte_offset; - Boolean LSF = null; - - public DataReader(BufferedInputStream stream) throws IOException { - this(stream, 0); - } - - public DataReader(BufferedInputStream stream, int size) throws IOException { - if (buffer_size > 0) { - this.DEFAULT_BUFFER_SIZE = size; - } - this.stream = stream; - byte_buffer = new byte[DEFAULT_BUFFER_SIZE]; - byte_offset = 0; - buffer_byte_offset = 0; - - bufferMoreBytes(); 
- } - - public BufferedInputStream getStream() { - return stream; - } - - public void setStream(BufferedInputStream stream) { - this.stream = stream; - } - - public void setLSF(boolean lsf) { - LSF = lsf; - } - - public Boolean isLSF() { - return LSF; - } - - // this returns the *absolute* byte offest in the stream. - public long getByteOffset () { - return byte_offset + buffer_byte_offset; - } - - public void setByteOffset(long byte_offset) { - this.byte_offset = byte_offset; - } - - /* - readBytes is the workhorse method of the internal Data Reader class. - it reads the requested number of bytes from the buffer, if available, - refilling the buffer as necessary. - the method allocates the byte array it returns, so there's no need - to do so outside of it. - the method will throw an exception if for whatever reason it cannot - read the requested number of bytes. - */ - public byte[] readBytes(int n) throws IOException { - if (n <= 0) { - throw new IOException("DataReader.readBytes called to read zero or negative number of bytes."); - } - byte[] bytes = new byte[n]; - - if (this.buffer_size - buffer_byte_offset >= n) { - System.arraycopy(byte_buffer, buffer_byte_offset, bytes, 0, n); - buffer_byte_offset+=n; - } else { - int bytes_read = 0; - - /* if there are any bytes left in the buffer, - * copy them into the return array: - */ - - if (this.buffer_size - buffer_byte_offset > 0) { - logger.fine("reading the remaining "+(this.buffer_size - buffer_byte_offset)+" bytes from the buffer"); - System.arraycopy(byte_buffer, buffer_byte_offset, bytes, 0, this.buffer_size - buffer_byte_offset); - //buffer_byte_offset = this.buffer_size; - bytes_read = this.buffer_size - buffer_byte_offset; - } - - int morebytes = bufferMoreBytes(); - logger.fine("buffered "+morebytes+" bytes"); - - /* - * TODO: combine this block with the one above -- ? 
- * if multiple BUFFER_SIZE-byte worth chunk of data is requested, - * keep reading and buffering: - */ - while (n - bytes_read > this.buffer_size) { - logger.fine("copying a full buffer-worth of bytes into the return array"); - System.arraycopy(byte_buffer, buffer_byte_offset, bytes, bytes_read, this.buffer_size); - //buffer_byte_offset = this.buffer_size; - bytes_read += this.buffer_size; - morebytes = bufferMoreBytes(); - logger.fine("buffered "+morebytes+" bytes"); - } - - /* - * finally, copy the last not-a-full-buffer-worth of bytes - * into the return buffer: - */ - logger.fine("copying the remaining "+(n-bytes_read)+" bytes."); - System.arraycopy(byte_buffer, 0, bytes, bytes_read, n - bytes_read); - buffer_byte_offset = n - bytes_read; - } - - return bytes; - } - - /* - * This method tries to read and buffer the DEFAULT_BUFFER_SIZE bytes - * and sets the current buffer size accordingly. - */ - - private int bufferMoreBytes() throws IOException { - int actual_bytes_read = stream.read(byte_buffer, 0, DEFAULT_BUFFER_SIZE); - - // set the current buffer size to the actual number of - // bytes read: - this.buffer_size = actual_bytes_read; - - // reset the current buffer offset and increment the total - // byte offset by the size of the last buffer - that should be - // equal to the buffer_byte_offset. - // (TODO: check that this is the case!) - - byte_offset += buffer_byte_offset; - buffer_byte_offset = 0; - - return actual_bytes_read; - } - - /* - * Convenience methods for reading single bytes of data. - * Just like with the other types of integers, both the signed and - * unsigned versions are provided. - * The readSignedByte() is used to read STATA *data* stored as - * type "Byte"; the unsigned version is used to read byte values - * in various sections of the file that store the lengths of byte - * sequences that follow. - * Examples: (TODO: ...) - */ - private byte readSignedByte() throws IOException { - /* Why not just use readBytes(1) here, you ask? 
- * - Because readBytes() will want to allocate a - * return byte[] buffer of size 1. */ - byte ret; - if (buffer_byte_offset > this.buffer_size) { - throw new IOException ("TD - buffer overflow"); - } - if (buffer_byte_offset < this.buffer_size) { - ret = byte_buffer[buffer_byte_offset]; - buffer_byte_offset++; - } else { - if (bufferMoreBytes() < 1) { - throw new IOException("reached the end of data stream prematurely."); - } - ret = byte_buffer[0]; - buffer_byte_offset = 1; - } - return ret; - } - - // Note that readByte() returns the value of Java type "short". - // This is to accommodate value larger than 127. - - private short readByte() throws IOException { - short ret = readSignedByte(); - - if (ret < 0) { - ret += 256; - } - return ret; - } - - /* Various reader methods for reading primitive numeric types; - * these are used both for reading the values from the data section - * (signed integer and floating-point types), and to read numeric - * values encoded as unsigned bytes in various sections of the file, - * advertising the lengths of the data sections that follow. - * Note that the internal methods bytesToInt() and bytesToSignedInt() - * will throw an exception if LSF (byte order flag) has not yet been - * set. - */ - - // Unsigned integer methods readInteger() and readShortInteger() - // below return long (8 byte) and int (4 byte) integers, respectively. - // This is to accommodate the values larger than 2^31-1 and 2^15-1, - // respectively. - - public long readInteger() throws IOException { - return readInteger(4); - } - - public int readSignedInteger() throws IOException { - return readSignedInteger(4); - } - - public int readShortInteger() throws IOException { - return readInteger(2); - } - - public short readShortSignedInteger() throws IOException { - return (short)readSignedInteger(2); - } - - // Only an unsigned version of readLongInteger() is provided; - // This is because STATA does not support 8 byte integer data types. 
- // 8 byte integers are only used as unsigned values specifying byte - // lengths. - - public long readLongInteger() throws IOException { - byte[] raw_bytes = readBytes(8); - - return bytesToLong(raw_bytes); - } - - // Service readInteger() methods that can read either 2 or 4 - // byte integers: - private int readInteger(int n) throws IOException { - byte[] raw_bytes = readBytes(n); - - return (int)bytesToInt(raw_bytes); - } - - private int readSignedInteger(int n) throws IOException { - byte[] raw_bytes = readBytes(n); - - return bytesToSignedInt(raw_bytes); - } - - // Floating point reader methods: - - public double readDouble() throws IOException { - if (LSF == null) { - throw new IOException("Byte order not determined for reading numeric values."); - } - ByteBuffer double_buffer = ByteBuffer.wrap(readBytes(8)); - if (LSF) { - double_buffer.order(ByteOrder.LITTLE_ENDIAN); - } - double ret = double_buffer.getDouble(); - return ret; - - } - - public float readFloat() throws IOException { - ByteBuffer float_buffer = ByteBuffer.wrap(readBytes(4)); - // TODO: - // this implies that floats are always stored in LSF/little endian... - // verify that this is still true in STATA 13! 
- float_buffer.order(ByteOrder.LITTLE_ENDIAN); - float ret = float_buffer.getFloat(); - return ret; - } - - /* - * internal service methods used by the methods above, to convert - * bytes into the appropriate numeric types: - */ - - // Unsigned version: - - private long bytesToInt (byte[] raw_bytes) throws IOException { - if (LSF == null) { - throw new IOException("Byte order not determined for reading numeric values."); - } - int n = raw_bytes.length; - - if (n != 2 && n != 4) { - throw new IOException("Unsupported number of bytes in an integer: "+n); - } - long ret = 0; - short unsigned_byte_value = 0; - - for (int i = 0; i < n; i++) { - if (LSF) { - unsigned_byte_value = raw_bytes[i]; - } else { - unsigned_byte_value = raw_bytes[n - i - 1]; - } - - if (unsigned_byte_value < 0) { - unsigned_byte_value += 256; - } - - ret += unsigned_byte_value * (1 << (8*i)); - } - - return ret; - } - - private int bytesToSignedInt(byte[] raw_bytes) throws IOException { - if (LSF == null) { - throw new IOException("Byte order not determined for reading numeric values."); - } - int n = raw_bytes.length; - ByteBuffer byte_buffer - = ByteBuffer.wrap(raw_bytes); - if (LSF) { - byte_buffer.order(ByteOrder.LITTLE_ENDIAN); - - } - int int_value; - if (n == 2) { - int_value = byte_buffer.getShort(); - } else if (n == 4) { - int_value = byte_buffer.getInt(); - } else { - throw new IOException("Unsupported number of bytes for signed integer: "+n); - } - return int_value; - } - - private long bytesToLong (byte[] raw_bytes) throws IOException { - if (raw_bytes.length != 8) { - throw new IOException("Wrong number of bytes in bytesToLong()."); - } - if (LSF == null) { - throw new IOException("Byte order not determined for reading numeric values."); - } - - long ret = 0; - - ByteBuffer byte_buffer - = ByteBuffer.wrap(raw_bytes); - if (LSF) { - byte_buffer.order(ByteOrder.LITTLE_ENDIAN); - - } - ret = byte_buffer.getLong(); - - return ret; - } - - /* - * Method for reading character strings: - * 
- * readString() reads NULL-terminated strings; i.e. it chops the - * string at the first zero encountered. - * we probably need an alternative, readRawString(), that reads - * a String as is. - */ - - public String readString(int n) throws IOException { - // TODO: - // double-check if variable names have to be ASCII: - // (regardless... this method is used for reading *all sorts* - // of strings, not just variable names - so we should *not* be - // defaulting to ascii, yes??) - // -- L.A. 4.0 beta 8 - - String ret = new String(readBytes(n), "US-ASCII"); - - // Remove the terminating and/or padding zero bytes: - if (ret != null && ret.indexOf(0) > -1) { - return ret.substring(0, ret.indexOf(0)); - } - - return ret; - } - - /* - * More complex helper methods for reading DTA117 "sections" ... - * TODO: document this ... - */ - - public byte[] readPrimitiveSection(String tag) throws IOException { - readOpeningTag(tag); - byte[] ret = readPrimitiveSectionBytes(); - readClosingTag(tag); - return ret; - } - - public byte[] readPrimitiveSection(String tag, int length) throws IOException { - readOpeningTag(tag); - byte[] ret = readBytes(length); - readClosingTag(tag); - return ret; - } - - public String readPrimitiveStringSection(String tag) throws IOException { - return new String(readPrimitiveSection(tag), "US-ASCII"); - } - - public String readPrimitiveStringSection(String tag, int length) throws IOException { - return new String(readPrimitiveSection(tag, length), "US-ASCII"); - } - - /* - * This method reads a string section the length of which is *defined*. - * the format of the section is as follows: - * Lxxxxxx...x - * where L is a single byte specifying the length of the enclosed - * string; followed by L bytes. - * L must be within - * 0 <= L <= limit - * (for example, the "dataset label" is limited to 80 characters). 
- */ - public String readDefinedStringSection(String tag, int limit) throws IOException { - readOpeningTag(tag); - short number = readByte(); - if (number < 0 || number > limit) { - throw new IOException (""); - } - String ret = null; - if (number > 0) { - ret = new String(readBytes(number), "US-ASCII"); - } - readClosingTag(tag); - return ret; - } - - - - public int readIntegerSection(String tag, int n) throws IOException { - readOpeningTag(tag); - int number = readInteger(n); - readClosingTag(tag); - return number; - } - - // This helper method is used for skipping the llll... sections - // inside the "" section; where llll is a 4-byte unsigned - // int followed by llll bytes. - - public void skipDefinedSections(String tag) throws IOException { - logger.fine("entering at offset "+buffer_byte_offset); - while (checkTag("<" + tag + ">")) { - // TODO: checkTag() should probably *read* the tag, if it is indeed - // encountered, rather then stop at the beginning of the tag. - logger.fine("tag "+tag+" encountered at offset "+buffer_byte_offset); - readOpeningTag(tag); - long number = readInteger(4); - logger.fine(number+" bytes in this section;"); - if (number < 0) { - throw new IOException (""); - } - // TODO: implement skipBytes() instead: - byte[] skipped_bytes = readBytes((int)number); - readClosingTag(tag); - logger.fine("read closing tag ;"); - - - } - logger.fine("exiting at offset "+buffer_byte_offset); - } - - private boolean checkTag(String tag) throws IOException { - if (tag == null || tag.equals("")) { - throw new IOException("opening tag must be a non-empty string."); - } - - int n = tag.length(); - - if (this.buffer_size - buffer_byte_offset >= n) { - return (tag).equals(new String(Arrays.copyOfRange(byte_buffer, buffer_byte_offset, buffer_byte_offset+n),"US-ASCII")); - } else { - throw new IOException("Checking section tags across byte buffers not yet implemented."); - } - } - - - - public void readOpeningTag(String tag) throws IOException { - if (tag == 
null || tag.equals("")) { - throw new IOException("opening tag must be a non-empty string."); - } - - byte[] openTag = readBytes(tag.length() + 2); - - String openTagString = new String (openTag, "US-ASCII"); - if (openTagString == null || !openTagString.equals("<"+tag+">")) { - throw new IOException("Could not read opening tag <"+tag+">"); - } - } - - public void readClosingTag(String tag) throws IOException { - if (tag == null || tag.equals("")) { - throw new IOException("closing tag must be a non-empty string."); - } - - byte[] closeTag = readBytes(tag.length() + 3); - - String closeTagString = new String (closeTag, "US-ASCII"); - - if (closeTagString == null || !closeTagString.equals("")) { - throw new IOException("Could not read closing tag "); - } - } - - - - private byte[] readPrimitiveSectionBytes() throws IOException { - byte[] cached_bytes = null; - - if (buffer_byte_offset > this.buffer_size) { - throw new IOException("Buffer overflow in DataReader."); - } - if (buffer_byte_offset == this.buffer_size) { - // buffer empty; - bufferMoreBytes(); - } - - int cached_offset = buffer_byte_offset; - - while (byte_buffer[buffer_byte_offset] != '<') { - buffer_byte_offset++; - - if (buffer_byte_offset == this.buffer_size) { - logger.fine("reached the end of buffer in readPrimitiveSectionBytes; offset "+buffer_byte_offset); - cached_bytes = mergeCachedBytes(cached_bytes, cached_offset); - bufferMoreBytes(); - cached_offset = 0; - } - } - - return mergeCachedBytes(cached_bytes, cached_offset); - } - - private byte[] mergeCachedBytes(byte[] cached_bytes, int cached_offset) throws IOException { - - byte[] ret_bytes; - if (cached_bytes == null) { - if (buffer_byte_offset - cached_offset < 0) { - throw new IOException("read error in save local buffer 1; TODO: better exception message"); - } - // empty section - as in
- if (buffer_byte_offset - cached_offset == 0) { - return null; - } - - ret_bytes = new byte[buffer_byte_offset - cached_offset]; - System.arraycopy(byte_buffer, cached_offset, ret_bytes, 0, buffer_byte_offset - cached_offset); - } else { - if (cached_offset != 0) { - throw new IOException("read error in save local buffer 2; TODO: better exception message"); - } - ret_bytes = new byte[cached_bytes.length + buffer_byte_offset]; - System.arraycopy(cached_bytes, 0, ret_bytes, 0, cached_bytes.length); - if (buffer_byte_offset > 0) { - System.arraycopy(byte_buffer, 0, ret_bytes, cached_bytes.length, buffer_byte_offset); - } - } - return ret_bytes; - } - - - - - } - - private class DTADataMap { - private long dta_offset_stata_data = 0; - private long dta_offset_map = 0; - private long dta_offset_variable_types = 0; - private long dta_offset_varnames = 0; - private long dta_offset_sortlist = 0; - private long dta_offset_formats = 0; - private long dta_offset_value_label_names = 0; - private long dta_offset_variable_labels = 0; - private long dta_offset_characteristics = 0; - private long dta_offset_data = 0; - private long dta_offset_strls = 0; - private long dta_offset_value_labels = 0; - private long dta_offset_data_close = 0; - private long dta_offset_eof = 0; - - // getters: - - public long getOffset_head() { - return dta_offset_stata_data; - } - public long getOffset_map() { - return dta_offset_map; - } - public long getOffset_types() { - return dta_offset_variable_types; - } - public long getOffset_varnames() { - return dta_offset_varnames; - } - public long getOffset_srtlist() { - return dta_offset_sortlist; - } - public long getOffset_fmts() { - return dta_offset_formats; - } - public long getOffset_vlblnames() { - return dta_offset_value_label_names; - } - public long getOffset_varlabs() { - return dta_offset_variable_labels; - } - public long getOffset_characteristics() { - return dta_offset_characteristics; - } - public long getOffset_data() { - return 
dta_offset_data; - } - public long getOffset_strls() { - return dta_offset_strls; - } - public long getOffset_vallabs() { - return dta_offset_value_labels; - } - public long getOffset_data_close() { - return dta_offset_data_close; - } - public long getOffset_eof() { - return dta_offset_eof; - } - - // setters: - - public void setOffset_head(long dta_offset_stata_data) { - this.dta_offset_stata_data = dta_offset_stata_data; - } - public void setOffset_map(long dta_offset_map) { - this.dta_offset_map = dta_offset_map; - } - public void setOffset_types(long dta_offset_variable_types) { - this.dta_offset_variable_types = dta_offset_variable_types; - } - public void setOffset_varnames(long dta_offset_varnames) { - this.dta_offset_varnames = dta_offset_varnames; - } - public void setOffset_srtlist(long dta_offset_sortlist) { - this.dta_offset_sortlist = dta_offset_sortlist; - } - public void setOffset_fmts(long dta_offset_formats) { - this.dta_offset_formats = dta_offset_formats; - } - public void setOffset_vlblnames(long dta_offset_value_label_names) { - this.dta_offset_value_label_names = dta_offset_value_label_names; - } - public void setOffset_varlabs(long dta_offset_variable_labels) { - this.dta_offset_variable_labels = dta_offset_variable_labels; - } - public void setOffset_characteristics(long dta_offset_characteristics) { - this.dta_offset_characteristics = dta_offset_characteristics; - } - public void setOffset_data(long dta_offset_data) { - this.dta_offset_data = dta_offset_data; - } - public void setOffset_strls(long dta_offset_strls) { - this.dta_offset_strls = dta_offset_strls; - } - public void setOffset_vallabs(long dta_offset_value_labels) { - this.dta_offset_value_labels = dta_offset_value_labels; - } - public void setOffset_data_close(long dta_offset_data_close) { - this.dta_offset_data_close = dta_offset_data_close; - } - public void setOffset_eof(long dta_offset_eof) { - this.dta_offset_eof = dta_offset_eof; - } - } -} - diff --git 
a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java index ad303925fb8..8eda6d00ef5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java @@ -63,6 +63,9 @@ */ public class DTAFileReader extends TabularDataFileReader{ + + private static final Logger logger = Logger.getLogger(DTAFileReader.class.getCanonicalName()); + //@Inject //VariableServiceBean varService; // static fields, STATA-specific constants, etc. @@ -545,8 +548,13 @@ private void decodeHeader(BufferedInputStream stream) throws IOException { + new String(Hex.encodeHex(magic_number)) + "<-"); } + logger.info("magic_number[0]: " + magic_number[0]); + logger.info("magic_number[1]: " + magic_number[1]); + logger.info("magic_number[2]: " + magic_number[2]); if (magic_number[2] != 1) { dbgLog.fine("3rd byte is not 1: given file is not stata-dta type"); + // FIXME: Figure out the magic number for Stata 14. + // FIXME: Figure out the magic number for Stata 15. throw new IllegalArgumentException("The file is not in a STATA format that we can read or support."); } else if ((magic_number[1] != 1) && (magic_number[1] != 2)) { dbgLog.fine("2nd byte is neither 0 nor 1: this file is not stata-dta type"); @@ -2013,7 +2021,6 @@ used to be called d(), w(), m(), q(), and h(). Those names still work but are considered anachronisms. */ - long milliSeconds; String decodedDateTime=null; String format = null; @@ -2022,7 +2029,7 @@ used to be called d(), w(), m(), q(), and h(). 
// tc is a relatively new format // datum is millisecond-wise - milliSeconds = Long.parseLong(rawDatum)+ STATA_BIAS_TO_EPOCH; + milliSeconds = Math.round(new Double(rawDatum)) + STATA_BIAS_TO_EPOCH; decodedDateTime = sdf_ymdhmsS.format(new Date(milliSeconds)); format = sdf_ymdhmsS.toPattern(); if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("tc: result="+decodedDateTime+", format = "+format); @@ -2037,7 +2044,7 @@ used to be called d(), w(), m(), q(), and h(). } else if (FormatType.matches("^%t?w.*")){ - long weekYears = Long.parseLong(rawDatum); + long weekYears = Math.round(new Double(rawDatum)); long left = Math.abs(weekYears)%52L; long years; if (weekYears < 0L){ @@ -2068,7 +2075,7 @@ used to be called d(), w(), m(), q(), and h(). } else if (FormatType.matches("^%t?m.*")){ // month - long monthYears = Long.parseLong(rawDatum); + long monthYears = Math.round(new Double(rawDatum)); long left = Math.abs(monthYears)%12L; long years; if (monthYears < 0L){ @@ -2096,7 +2103,7 @@ used to be called d(), w(), m(), q(), and h(). 
} else if (FormatType.matches("^%t?q.*")){ // quater - long quaterYears = Long.parseLong(rawDatum); + long quaterYears = Math.round(new Double(rawDatum)); long left = Math.abs(quaterYears)%4L; long years; if (quaterYears < 0L){ diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReader.java new file mode 100644 index 00000000000..f321f507d30 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReader.java @@ -0,0 +1,482 @@ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.logging.Logger; + +public class DataReader { + private static Logger logger = Logger.getLogger(DTAFileReader.class.getPackage().getName()); + private BufferedInputStream stream; + private int DEFAULT_BUFFER_SIZE = 8192;// * 2; + private byte[] buffer; + private int buffer_size; + private long byte_offset; + private int buffer_byte_offset; + private Boolean LSF = null; + + public DataReader(BufferedInputStream stream) throws IOException { + this(stream, 0); + } + + public DataReader(BufferedInputStream stream, int size) throws IOException { + if (buffer_size > 0) { + this.DEFAULT_BUFFER_SIZE = size; + } + this.stream = stream; + buffer = new byte[DEFAULT_BUFFER_SIZE]; + byte_offset = 0; + buffer_byte_offset = 0; + + bufferMoreBytes(); + } + + public void setLSF(boolean lsf) { + LSF = lsf; + } + + // this returns the *absolute* byte offest in the stream. + public long getByteOffset() { + return byte_offset + buffer_byte_offset; + } + + /* + readBytes is the workhorse method of the internal Data Reader class. + it reads the requested number of bytes from the buffer, if available, + refilling the buffer as necessary. 
+ the method allocates the byte array it returns, so there's no need + to do so outside of it. + the method will throw an exception if for whatever reason it cannot + read the requested number of bytes. + */ + public byte[] readBytes(int n) throws IOException { + if (n <= 0) { + throw new IOException("DataReader.readBytes called to read zero or negative number of bytes."); + } + byte[] bytes = new byte[n]; + + if (this.buffer_size - buffer_byte_offset >= n) { + System.arraycopy(buffer, buffer_byte_offset, bytes, 0, n); + buffer_byte_offset += n; + } else { + int bytes_read = 0; + + // copy any bytes left in the buffer into the return array: + if (this.buffer_size - buffer_byte_offset > 0) { + logger.fine("reading the remaining " + (this.buffer_size - buffer_byte_offset) + " bytes from the buffer"); + System.arraycopy(buffer, buffer_byte_offset, bytes, 0, this.buffer_size - buffer_byte_offset); + //buffer_byte_offset = this.buffer_size; + bytes_read = this.buffer_size - buffer_byte_offset; + buffer_byte_offset = this.buffer_size; + } + + int morebytes = bufferMoreBytes(); + logger.fine("buffered " + morebytes + " bytes"); + + /* + * keep reading and buffering buffer-size chunks, until + * we read the requested number of bytes. 
+ */ + while (n - bytes_read > this.buffer_size) { + logger.fine("copying a full buffer-worth of bytes into the return array"); + System.arraycopy(buffer, buffer_byte_offset, bytes, bytes_read, this.buffer_size); + //buffer_byte_offset = this.buffer_size; + bytes_read += this.buffer_size; + buffer_byte_offset = this.buffer_size; + morebytes = bufferMoreBytes(); + logger.fine("buffered "+morebytes+" bytes"); + } + + /* + * finally, copy the last not-a-full-buffer-worth of bytes + * into the return buffer: + */ + logger.fine("copying the remaining " + (n - bytes_read) + " bytes."); + System.arraycopy(buffer, 0, bytes, bytes_read, n - bytes_read); + buffer_byte_offset = n - bytes_read; + } + + return bytes; + } + + /* + * This method tries to read and buffer the DEFAULT_BUFFER_SIZE bytes + * and sets the current buffer size accordingly. + */ + private int bufferMoreBytes() throws IOException { + int actual_bytes_read; + byte_offset += buffer_byte_offset; + + if (byte_offset == 0 || buffer_byte_offset == buffer_size) { + actual_bytes_read = stream.read(buffer, 0, DEFAULT_BUFFER_SIZE); + // set the current buffer size to the actual number of + // bytes read: + this.buffer_size = actual_bytes_read; + + // reset the current buffer offset and increment the total + // byte offset by the size of the last buffer - that should be + // equal to the buffer_byte_offset. 
+ + } else if (buffer_byte_offset < buffer_size) { + System.arraycopy(buffer, buffer_byte_offset, buffer, 0, buffer_size - buffer_byte_offset); + this.buffer_size = buffer_size - buffer_byte_offset; + actual_bytes_read = stream.read(buffer, buffer_size, DEFAULT_BUFFER_SIZE - buffer_size); + buffer_size += actual_bytes_read; + + } else { + throw new IOException("Offset already past the buffer boundary"); + } + buffer_byte_offset = 0; + + return actual_bytes_read; + } + + /* + * Checks that LSF is not null, and sets the buffer byte order accordingly + */ + private void checkLSF(ByteBuffer buffer) throws IOException{ + if (LSF == null) { + throw new IOException("Byte order not determined for reading numeric values."); + } else if (LSF) { + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + } + + /* + * Convenience methods for reading single bytes of data. + * Just like with the other types of integers, both the signed and + * unsigned versions are provided. + * The readByte() is used to read STATA *data* stored as + * type "Byte"; the unsigned version is used to read byte values + * in various sections of the file that store the lengths of byte + * sequences that follow. + */ + public byte readByte() throws IOException { + /* Why not just use readBytes(1) here, you ask? + * - Because readBytes() will want to allocate a + * return byte[] buffer of size 1. */ + byte ret; + if (buffer_byte_offset > this.buffer_size) { + throw new IOException("TD - buffer overflow"); + } else if (buffer_byte_offset < this.buffer_size) { + ret = buffer[buffer_byte_offset]; + buffer_byte_offset++; + } else { + if (bufferMoreBytes() < 1) { + throw new IOException("reached the end of data stream prematurely."); + } + ret = buffer[0]; + buffer_byte_offset = 1; + } + return ret; + } + + // Note that readUByte() returns the value of Java type "short". + // This is to accommodate value larger than 127. 
+ public short readUByte() throws IOException { + short ret = readByte(); + if (ret < 0) { + ret += 256; + } + return ret; + } + + /* Various reader methods for reading primitive numeric types; + * these are used both for reading the values from the data section + * (signed integer and floating-point types), and to read numeric + * values encoded as unsigned bytes in various sections of the file, + * advertising the lengths of the data sections that follow. + * Note that the internal methods bytesToInt() and bytesToSignedInt() + * will throw an exception if LSF (byte order flag) has not yet been + * set. + */ + // Unsigned integer methods readUInt() and readUShort() + // return long (8 byte) and int (4 byte) integers for overflow reasons + public int readUShort() throws IOException { + return (int) readULong(2); + } + + public long readUInt() throws IOException { + return readULong(4); + } + + public long readULong() throws IOException { + return readULong(8); + } + + public short readShort() throws IOException { + ByteBuffer byte_buffer = ByteBuffer.wrap(readBytes(2)); + checkLSF(byte_buffer); + return byte_buffer.getShort(); + } + + public int readInt() throws IOException { + ByteBuffer byte_buffer = ByteBuffer.wrap(readBytes(4)); + checkLSF(byte_buffer); + return byte_buffer.getInt(); + } + + public long readULong(int n) throws IOException { + byte[] raw_bytes = readBytes(n); + if (LSF == null) { + throw new IOException("Byte order not determined for reading numeric values."); + } + + if (n != 2 && n != 4 && n != 6 && n != 8) { + throw new IOException("Unsupported number of bytes in an integer: " + n); + } + long ret = 0; + short unsigned_byte_value; + + for (int i = 0; i < n; i++) { + if (LSF) { + unsigned_byte_value = raw_bytes[i]; + } else { + unsigned_byte_value = raw_bytes[n - i - 1]; + } + + if (unsigned_byte_value < 0) { + unsigned_byte_value += 256; + } + + ret += unsigned_byte_value * (1L << (8 * i)); + } + if(ret < 0){ + throw new IOException("Sorry 
for hoping this wouldn't be used with values over 2^63-1"); + } + return ret; + } + + // Floating point reader methods: + public double readDouble() throws IOException { + ByteBuffer byte_buffer = ByteBuffer.wrap(readBytes(8)); + checkLSF(byte_buffer); + return byte_buffer.getDouble(); + } + + public float readFloat() throws IOException { + ByteBuffer byte_buffer = ByteBuffer.wrap(readBytes(4)); + checkLSF(byte_buffer); + return byte_buffer.getFloat(); + } + + + /* + * Method for reading character strings: + * + * readString() reads NULL-terminated strings; i.e. it chops the + * string at the first zero encountered. + * we probably need an alternative, readRawString(), that reads + * a String as is. + */ + public String readString(int n) throws IOException { + + String ret = new String(readBytes(n), "US-ASCII"); + + // Remove the terminating and/or padding zero bytes: + if (ret != null && ret.indexOf(0) > -1) { + return ret.substring(0, ret.indexOf(0)); + } + return ret; + } + + /* + * More complex helper methods for reading NewDTA "sections" ... + */ + public byte[] readPrimitiveSection(String tag) throws IOException { + readOpeningTag(tag); + byte[] ret = readPrimitiveSectionBytes(); + readClosingTag(tag); + return ret; + } + + public byte[] readPrimitiveSection(String tag, int length) throws IOException { + readOpeningTag(tag); + byte[] ret = readBytes(length); + readClosingTag(tag); + return ret; + } + + public String readPrimitiveStringSection(String tag) throws IOException { + return new String(readPrimitiveSection(tag), "US-ASCII"); + } + + public String readPrimitiveStringSection(String tag, int length) throws IOException { + return new String(readPrimitiveSection(tag, length), "US-ASCII"); + } + + public String readLabelSection(String tag, int limit) throws IOException { + readOpeningTag(tag); + /** + * ll The byte length of the UTF-8 characters, whose length is + * recorded in a 2-byte unsigned integer encoded according to + * byteorder. 
+ */ + int lengthOfLabel = readUShort(); + logger.fine("length of label: " + lengthOfLabel); + String label = null; + if (lengthOfLabel > 0) { + label = new String(readBytes(lengthOfLabel), "US-ASCII"); + } + logger.fine("ret: " + label); + readClosingTag(tag); + return label; + } + + /* + * This method reads a string section the length of which is *defined*. + * the format of the section is as follows: + * Lxxxxxx...x + * where L is a single byte specifying the length of the enclosed + * string; followed by L bytes. + * L must be within + * 0 <= L <= limit + * (for example, the "dataset label" is limited to 80 characters). + */ + public String readDefinedStringSection(String tag, int limit) throws IOException { + readOpeningTag(tag); + short number = readUByte(); + logger.fine("number: " + number); + if (number < 0 || number > limit) { + throw new IOException(""); + } + String ret = null; + if (number > 0) { + ret = new String(readBytes(number), "US-ASCII"); + } + logger.fine("ret: " + ret); + readClosingTag(tag); + return ret; + } + + public long readIntegerSection(String tag, int n) throws IOException { + readOpeningTag(tag); + long number = readULong(n); + readClosingTag(tag); + return number; + } + + // This helper method is used for skipping the llll... sections + // inside the "" section; where llll is a 4-byte unsigned + // int followed by llll bytes. 
+ public void skipDefinedSections(String tag) throws IOException { + logger.fine("entering at offset " + buffer_byte_offset); + while (checkTag("<" + tag + ">")) { + logger.fine("tag " + tag + " encountered at offset " + buffer_byte_offset); + readOpeningTag(tag); + long number = readULong(4); + logger.fine(number + " bytes in this section;"); + if (number < 0) { + throw new IOException(""); + } + byte[] skipped_bytes = readBytes((int) number); + readClosingTag(tag); + logger.fine("read closing tag ;"); + + } + logger.fine("exiting at offset " + buffer_byte_offset); + } + + public boolean checkTag(String tag) throws IOException { + if (tag == null || tag.equals("")) { + throw new IOException("opening tag must be a non-empty string."); + } + + int n = tag.length(); + if ((this.buffer_size - buffer_byte_offset) >= n) { + return (tag).equals(new String(Arrays.copyOfRange(buffer, buffer_byte_offset, buffer_byte_offset+n),"US-ASCII")); + } + else{ + bufferMoreBytes(); + return checkTag(tag); + } + + } + + public void readOpeningTag(String tag) throws IOException { + if (tag == null || tag.equals("")) { + throw new IOException("opening tag must be a non-empty string."); + } + + String openTagString = new String(readBytes(tag.length() + 2), "US-ASCII"); + if (openTagString == null || !openTagString.equals("<"+tag+">")) { + throw new IOException("Could not read opening tag <"+tag+">"); + } + } + + public void readClosingTag(String tag) throws IOException { + if (tag == null || tag.equals("")) { + throw new IOException("closing tag must be a non-empty string."); + } + + String closeTagString = new String(readBytes(tag.length() + 3), "US-ASCII"); + logger.fine("closeTagString: " + closeTagString); + + if (closeTagString == null || !closeTagString.equals("")) { + StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); + String msg = ""; + for (int i = 0; i < 10; i++) { + StackTraceElement stackTraceElement = stackTrace[i]; + msg += 
stackTraceElement.toString() + "\n"; + } + throw new IOException("Could not read closing tag : " + msg); + } + } + + private byte[] readPrimitiveSectionBytes() throws IOException { + byte[] cached_bytes = null; + + if (buffer_byte_offset > this.buffer_size) { + throw new IOException("Buffer overflow in DataReader."); + } + if (buffer_byte_offset == this.buffer_size) { + // buffer empty; + bufferMoreBytes(); + } + + int cached_offset = buffer_byte_offset; + + while (buffer[buffer_byte_offset] != '<') { + buffer_byte_offset++; + + if (buffer_byte_offset == this.buffer_size) { + logger.fine("reached the end of buffer in readPrimitiveSectionBytes; offset " + buffer_byte_offset); + cached_bytes = mergeCachedBytes(cached_bytes, cached_offset); + bufferMoreBytes(); + cached_offset = 0; + } + } + + return mergeCachedBytes(cached_bytes, cached_offset); + } + + private byte[] mergeCachedBytes(byte[] cached_bytes, int cached_offset) throws IOException { + + byte[] ret_bytes; + if (cached_bytes == null) { + if (buffer_byte_offset - cached_offset < 0) { + throw new IOException("Error merging internal read buffer (no bytes cached to merge)"); + } + // empty section - as in
+ if (buffer_byte_offset - cached_offset == 0) { + return null; + } + + ret_bytes = new byte[buffer_byte_offset - cached_offset]; + System.arraycopy(buffer, cached_offset, ret_bytes, 0, buffer_byte_offset - cached_offset); + } else { + if (cached_offset != 0) { + throw new IOException("Error merging internal read buffer (non-zero cached offset)"); + } + ret_bytes = new byte[cached_bytes.length + buffer_byte_offset]; + System.arraycopy(cached_bytes, 0, ret_bytes, 0, cached_bytes.length); + if (buffer_byte_offset > 0) { + System.arraycopy(buffer, 0, ret_bytes, cached_bytes.length, buffer_byte_offset); + } + } + return ret_bytes; + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java new file mode 100644 index 00000000000..522d00b275d --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReader.java @@ -0,0 +1,1624 @@ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; + +import java.io.*; +import java.util.logging.*; + +import java.util.*; +import java.text.*; + +import org.apache.commons.lang.*; + +import edu.harvard.iq.dataverse.DataTable; +import edu.harvard.iq.dataverse.datavariable.DataVariable; +import edu.harvard.iq.dataverse.datavariable.VariableCategory; + +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; + +/** + * ingest plugin for Stata 13-15 (117-119) DTA file format. 
A copy and paste from + * + * - v.13/("dta 117"): https://www.stata.com/help.cgi?dta_117 + * + * - v.14/("dta 118"): https://www.stata.com/help.cgi?dta + * + * - v.14/("dta 119"): https://www.stata.com/help.cgi?dta_119 + * + */ +public class NewDTAFileReader extends TabularDataFileReader { + //@Inject + //VariableServiceBean varService; + // static fields, STATA-specific constants, etc. + + // SECTION TAGS: + // + // The new STATA format features XML-like section tags - + //
...
...
+ + // MAIN, TOP-LEVEL FILE SECTION: + private static final String TAG_DTA = "stata_dta"; + + // HEADER SECTION: + private static final String TAG_HEADER = "header"; + private static final String TAG_HEADER_FILEFORMATID = "release"; + private static final String TAG_HEADER_BYTEORDER = "byteorder"; + private static final String TAG_HEADER_VARNUMBER = "K"; + private static final String TAG_HEADER_OBSNUMBER = "N"; + private static final String TAG_HEADER_FILELABEL = "label"; + private static final String TAG_HEADER_TIMESTAMP = "timestamp"; + + // MAP SECTION: + private static final String TAG_MAP = "map"; + + // VARIABLE TYPES SECTION: + private static final String TAG_VARIABLE_TYPES = "variable_types"; + + // VARIABLE NAMES SECTION: + private static final String TAG_VARIABLE_NAMES = "varnames"; + + // VARIABLE SORT ORDER SECTION: + private static final String TAG_SORT_ORDER = "sortlist"; + + // VARIABLE DISPLAY FORMATS: + private static final String TAG_DISPLAY_FORMATS = "formats"; + + // VALUE LABEL FORMAT NAMES: + private static final String TAG_VALUE_LABEL_FORMAT_NAMES = "value_label_names"; + + // VARIABLE LABELS: + private static final String TAG_VARIABLE_LABELS = "variable_labels"; + + // "CHARACTERISTICS": + private static final String TAG_CHARACTERISTICS = "characteristics"; + private static final String TAG_CHARACTERISTICS_SUBSECTION = "ch"; + + // DATA SECTION! 
+ private static final String TAG_DATA = "data"; + + // STRLs SECTION: + private static final String TAG_STRLS = "strls"; + private static final String STRL_GSO_HEAD = "GSO"; + + // VALUE LABELS SECTION: + private static final String TAG_VALUE_LABELS = "value_labels"; + private static final String TAG_VALUE_LABELS_LBL_DEF = "lbl"; + + private static Map STATA_RELEASE_NUMBER = + new HashMap(); + + private static Map> CONSTANT_TABLE = + new LinkedHashMap>(); + + private static Map releaseconstant + = new LinkedHashMap(); + + private static Map byteLengthTable = + new HashMap(); + + private static Map variableTypeTable = + new LinkedHashMap(); + + private static final int[] LENGTH_HEADER = {60, 109}; + private static final int[] LENGTH_LABEL = {32, 81}; + private static final int[] LENGTH_NAME = {9, 33}; + private static final int[] LENGTH_FORMAT_FIELD = {7, 12, 49}; + private static final int[] LENGTH_EXPANSION_FIELD = {0, 2, 4}; + private static final int[] DBL_MV_PWR = {333, 1023}; + + private static final int DTA_MAGIC_NUMBER_LENGTH = 4; + private static final int NVAR_FIELD_LENGTH = 2; + private static final int NOBS_FIELD_LENGTH = 4; + private static final int TIME_STAMP_LENGTH = 18; + private static final int VAR_SORT_FIELD_LENGTH = 2; + private static final int VALUE_LABEL_HEADER_PADDING_LENGTH = 3; + + private static int MISSING_VALUE_BIAS = 26; + + private byte BYTE_MISSING_VALUE = Byte.MAX_VALUE; + private short INT_MISSIG_VALUE = Short.MAX_VALUE; + private int LONG_MISSING_VALUE = Integer.MAX_VALUE; + + // Static initialization: + static { + releaseconstant.put("HEADER", LENGTH_HEADER[1]); + releaseconstant.put("LABEL", LENGTH_LABEL[1]); + releaseconstant.put("NAME", LENGTH_NAME[1]); + releaseconstant.put("FORMAT", LENGTH_FORMAT_FIELD[1]); + releaseconstant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]); + releaseconstant.put("DBL_MV_PWR", DBL_MV_PWR[1]); + + // 1, 2 and 4-byte integers: + byteLengthTable.put("Byte",1); + byteLengthTable.put("Integer",2); + 
byteLengthTable.put("Long",4); + // 4 and 8-byte floats: + byteLengthTable.put("Float",4); + byteLengthTable.put("Double",8); + // STRLs are defined in their own section, outside of the + // main data. In the section they are referenced + // by 2 x 4 byte values, "(v,o)", 8 bytes total. + byteLengthTable.put("STRL",8); + + variableTypeTable.put(65530,"Byte"); + variableTypeTable.put(65529,"Integer"); + variableTypeTable.put(65528,"Long"); + variableTypeTable.put(65527,"Float"); + variableTypeTable.put(65526,"Double"); + } + + private static String unfVersionNumber = "6"; + + private static final List FLOAT_MISSING_VALUES = Arrays.asList( + 0x1.000p127f, 0x1.001p127f, 0x1.002p127f, 0x1.003p127f, + 0x1.004p127f, 0x1.005p127f, 0x1.006p127f, 0x1.007p127f, + 0x1.008p127f, 0x1.009p127f, 0x1.00ap127f, 0x1.00bp127f, + 0x1.00cp127f, 0x1.00dp127f, 0x1.00ep127f, 0x1.00fp127f, + 0x1.010p127f, 0x1.011p127f, 0x1.012p127f, 0x1.013p127f, + 0x1.014p127f, 0x1.015p127f, 0x1.016p127f, 0x1.017p127f, + 0x1.018p127f, 0x1.019p127f, 0x1.01ap127f); + + private Set FLOAT_MISSING_VALUE_SET = + new HashSet<>(FLOAT_MISSING_VALUES); + + private static final List DOUBLE_MISSING_VALUE_LIST = Arrays.asList( + 0x1.000p1023, 0x1.001p1023, 0x1.002p1023, 0x1.003p1023, 0x1.004p1023, + 0x1.005p1023, 0x1.006p1023, 0x1.007p1023, 0x1.008p1023, 0x1.009p1023, + 0x1.00ap1023, 0x1.00bp1023, 0x1.00cp1023, 0x1.00dp1023, 0x1.00ep1023, + 0x1.00fp1023, 0x1.010p1023, 0x1.011p1023, 0x1.012p1023, 0x1.013p1023, + 0x1.014p1023, 0x1.015p1023, 0x1.016p1023, 0x1.017p1023, 0x1.018p1023, + 0x1.019p1023, 0x1.01ap1023); + + private Set DOUBLE_MISSING_VALUE_SET = + new HashSet<>(DOUBLE_MISSING_VALUE_LIST); + + private static SimpleDateFormat sdf_ymdhmsS = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); // sdf + + private static SimpleDateFormat sdf_ymd = new SimpleDateFormat("yyyy-MM-dd"); // sdf2 + + private static SimpleDateFormat sdf_hms = new SimpleDateFormat("HH:mm:ss"); // stf + + private static SimpleDateFormat sdf_yw = 
new SimpleDateFormat("yyyy-'W'ww"); + + // stata's calendar + private static Calendar GCO_STATA = new GregorianCalendar(TimeZone.getTimeZone("GMT")); + + private static String[] DATE_TIME_FORMAT = { + "%tc", "%td", "%tw", "%tq", "%tm", "%th", "%ty", + "%d", "%w", "%q", "%m", "h", "%tb" + }; + // New "business calendar format" has been added in Stata 12. -- L.A. + private static String[] DATE_TIME_CATEGORY = { + "time", "date", "date", "date", "date", "date", "date", + "date", "date", "date", "date", "date", "date" + }; + private static Map DATE_TIME_FORMAT_TABLE = new LinkedHashMap(); + + private static long MILLISECCONDS_PER_DAY = 24 * 60 * 60 * 1000L; + + private static long STATA_BIAS_TO_EPOCH; + + static { + + sdf_ymdhmsS.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_yw.setTimeZone(TimeZone.getTimeZone("GMT")); + + // set stata's calendar + GCO_STATA.set(1, 1960);// year + GCO_STATA.set(2, 0); // month + GCO_STATA.set(5, 1);// day of month + GCO_STATA.set(9, 0);// AM(0) or PM(1) + GCO_STATA.set(10, 0);// hh + GCO_STATA.set(12, 0);// mm + GCO_STATA.set(13, 0);// ss + GCO_STATA.set(14, 0); // SS millisecond + + STATA_BIAS_TO_EPOCH = GCO_STATA.getTimeInMillis(); // = -315619200000 + + for (int i=0; i constantTable; + + private Map cachedGSOs; + + private NumberFormat twoDigitFormatter = new DecimalFormat("00"); + + private NumberFormat doubleNumberFormatter = new DecimalFormat(); + + TabularDataIngest ingesteddata = new TabularDataIngest(); + + private int DTAVersion; + + private int headerLength; + + private int dataLabelLength; + + private boolean hasSTRLs = false; + + /* variableTypes is a list of string values representing the type of + * data values *stored* in the file - "byte", "integer", "float", "string", + * etc. 
We need this information as we're reading the data, to know how + * many bytes to read for every object type and how to convert the binary + * data into the proper Java type. + * It's important to note that these types are *Stata* types - the types + * of the variables on the DVN side may change (see below). + * The variableTypesFinal will describe the data values once they have + * been read and stored in the tab. file. This is an important distinction: + * for example, the time/data values are stored as binary numeric values + * in Stata files, but we'll be storing them as strings in the DVN tabular + * files. + */ + + private String[] variableTypes=null; + + private String[] dateVariableFormats=null; + + private static final String MissingValueForTabDelimitedFile = ""; + + private String[] MIME_TYPE = { + "application/x-stata", + "application/x-stata-13", + "application/x-stata-14", + "application/x-stata-15" + }; + + // Constructor -----------------------------------------------------------// + public NewDTAFileReader(TabularDataFileReaderSpi originator, int DTAVersion) { + super(originator); + + this.DTAVersion = DTAVersion; + STATA_RELEASE_NUMBER.put(DTAVersion, "v." 
+ (DTAVersion-104)); + + CONSTANT_TABLE.put(DTAVersion, releaseconstant); + } + + + /* + * This method configures Stata's release-specific parameters: + */ + private void init() throws IOException { + // + logger.fine("release number=" + DTAVersion); + + BYTE_MISSING_VALUE -= MISSING_VALUE_BIAS; + INT_MISSIG_VALUE -= MISSING_VALUE_BIAS; + LONG_MISSING_VALUE -= MISSING_VALUE_BIAS; + + constantTable = CONSTANT_TABLE.get(DTAVersion); + + headerLength = constantTable.get("HEADER") - DTA_MAGIC_NUMBER_LENGTH; + + dataLabelLength = headerLength - (NVAR_FIELD_LENGTH + + NOBS_FIELD_LENGTH + TIME_STAMP_LENGTH); + logger.fine("data_label_length=" + dataLabelLength); + + logger.fine("constant table to be used:\n" + constantTable); + + doubleNumberFormatter.setGroupingUsed(false); + doubleNumberFormatter.setMaximumFractionDigits(340); + } + + @Override + public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException { + logger.fine("NewDTAFileReader: read() start"); + + // shit ton of diagnostics (still) needed here!! -- L.A. 
+ if (dataFile != null) { + throw new IOException("this plugin does not support external raw data files"); + } + + DataReader dataReader; + + init(); + dataReader = new DataReader(stream); + dataReader.readOpeningTag(TAG_DTA); + readHeader(dataReader); + readMap(dataReader); + readVariableTypes(dataReader); + readVariableNames(dataReader); + readSortOrder(dataReader); + readDisplayFormats(dataReader); + readValueLabelFormatNames(dataReader); + readVariableLabels(dataReader); + // "characteristics" - STATA-proprietary information + // (we are skipping it) + readCharacteristics(dataReader); + readData(dataReader); + + // (potentially) large, (potentially) non-ASCII character strings + // saved outside the section, and referenced + // in the data with (v,o) notation - docs have more info + readSTRLs(dataReader); + readValueLabels(dataReader); + dataReader.readClosingTag(TAG_DTA); + + ingesteddata.setDataTable(dataTable); + + logger.fine("NewDTAFileReader: read() end."); + return ingesteddata; + } + + private void readHeader(DataReader dataReader) throws IOException { + logger.fine("readHeader(): start"); + + if (dataReader == null) { + throw new IllegalArgumentException("stream == null!"); + } + + logger.fine("reading the version header."); + + dataReader.readOpeningTag(TAG_HEADER); + String dtaVersionTag = dataReader.readPrimitiveStringSection(TAG_HEADER_FILEFORMATID, 3); + + if (!("117".equals(dtaVersionTag)||"118".equals(dtaVersionTag)||"119".equals(dtaVersionTag))) { + throw new IOException("Unexpected version tag found: " + dtaVersionTag + "; expected value: 117-119."); + } + + String byteOrderTag = dataReader.readPrimitiveStringSection(TAG_HEADER_BYTEORDER); + + logger.fine("byte order: "+byteOrderTag); + + dataReader.setLSF("LSF".equals(byteOrderTag)); + + long varNumber = dataReader.readIntegerSection(TAG_HEADER_VARNUMBER, DTAVersion == 119? 
4: 2); + logger.fine("number of variables: " + varNumber); + + /** + * 5.1.4 N, # of observations + * + * N, the number of observations stored in the dataset, is recorded as + * a 4 or 8-byte unsigned integer field recorded according to byteorder. + */ + long obsNumber = dataReader.readIntegerSection(TAG_HEADER_OBSNUMBER, DTAVersion == 117? 4: 8); + logger.fine("number of observations: " + obsNumber); + + dataTable.setVarQuantity(varNumber); + dataTable.setCaseQuantity(obsNumber); + + dataTable.setOriginalFileFormat(MIME_TYPE[0]); + + dataTable.setOriginalFormatVersion("STATA " + (DTAVersion-104)); + dataTable.setUnf("UNF:pending"); + + // The word "dataset" below is used in its STATA parlance meaning, + // i.e., this is a label that describes the datafile. + String datasetLabel; + if (DTAVersion==117){ + datasetLabel = dataReader.readDefinedStringSection(TAG_HEADER_FILELABEL, 80); + }else{ + datasetLabel = dataReader.readLabelSection(TAG_HEADER_FILELABEL, 320); + } + logger.fine("Stata \"dataset\" label: " + datasetLabel); + + // TODO: + // We are not doing anything with this label. But maybe we should? + // We could add a "description" field to the Dataverse DataTable object, + // and maybe put it there. Alternatively we could add some other mechanism for + // the ingest plugin to pass this label back to Dataverse, and maybe + // appending it to the DataFile description in the FileMetadata object. + // Probably not the highest priority. 
+ String datasetTimeStamp = dataReader.readDefinedStringSection(TAG_HEADER_TIMESTAMP, 17); + logger.fine("dataset time stamp: " + datasetTimeStamp); + + if (datasetTimeStamp == null + || (datasetTimeStamp.length() > 0 && datasetTimeStamp.length() < 17)) { + throw new IOException("unexpected/invalid length of the time stamp in the NewDTA header."); + } else { + // If we decide that we actually want/need to use this time stamp for any + // practical purposes (again, we could add it to the descriptive + // metadata somehow), we should probably validate it against dd Mon yyyy hh:mm. + } + + dataReader.readClosingTag("header"); + logger.fine("readHeader(): end"); + } + + private void readMap(DataReader reader) throws IOException { + logger.fine("Map section; at offset " + reader.getByteOffset()); + reader.readOpeningTag(TAG_MAP); + + dtaMap = new DTADataMap(); + + long dta_offset_stata_data = reader.readULong(); + logger.fine("dta_offset_stata_data: " + dta_offset_stata_data); + dtaMap.setOffset_head(dta_offset_stata_data); + long dta_offset_map = reader.readULong(); + logger.fine("dta_offset_map: " + dta_offset_map); + dtaMap.setOffset_map(dta_offset_map); + long dta_offset_variable_types = reader.readULong(); + logger.fine("dta_offset_variable_types: " + dta_offset_variable_types); + dtaMap.setOffset_types(dta_offset_variable_types); + long dta_offset_varnames = reader.readULong(); + logger.fine("dta_offset_varnames: " + dta_offset_varnames); + dtaMap.setOffset_varnames(dta_offset_varnames); + long dta_offset_sortlist = reader.readULong(); + logger.fine("dta_offset_sortlist: " + dta_offset_sortlist); + dtaMap.setOffset_srtlist(dta_offset_sortlist); + long dta_offset_formats = reader.readULong(); + logger.fine("dta_offset_formats: " + dta_offset_formats); + dtaMap.setOffset_fmts(dta_offset_formats); + long dta_offset_value_label_names = reader.readULong(); + logger.fine("dta_offset_value_label_names: " + dta_offset_value_label_names); + 
dtaMap.setOffset_vlblnames(dta_offset_value_label_names); + long dta_offset_variable_labels = reader.readULong(); + logger.fine("dta_offset_variable_labels: " + dta_offset_variable_labels); + dtaMap.setOffset_varlabs(dta_offset_variable_labels); + long dta_offset_characteristics = reader.readULong(); + logger.fine("dta_offset_characteristics: " + dta_offset_characteristics); + dtaMap.setOffset_characteristics(dta_offset_characteristics); + long dta_offset_data = reader.readULong(); + logger.fine("dta_offset_data: " + dta_offset_data); + dtaMap.setOffset_data(dta_offset_data); + long dta_offset_strls = reader.readULong(); + logger.fine("dta_offset_strls: " + dta_offset_strls); + dtaMap.setOffset_strls(dta_offset_strls); + long dta_offset_value_labels = reader.readULong(); + logger.fine("dta_offset_value_labels: " + dta_offset_value_labels); + dtaMap.setOffset_vallabs(dta_offset_value_labels); + long dta_offset_data_close = reader.readULong(); + logger.fine("dta_offset_data_close: " + dta_offset_data_close); + dtaMap.setOffset_data_close(dta_offset_data_close); + long dta_offset_eof = reader.readULong(); + logger.fine("dta_offset_eof: " + dta_offset_eof); + dtaMap.setOffset_eof(dta_offset_eof); + + reader.readClosingTag(TAG_MAP); + + } + + /* + * Variable type information is stored in the ... + * section, as number_of_variables * 2 byte values. + * Consult the Stata documentation for the type definition codes. 
+ */ + private void readVariableTypes(DataReader reader) throws IOException { + logger.fine("Type section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_types()); + reader.readOpeningTag(TAG_VARIABLE_TYPES); + + List variableList = new ArrayList<>(); + // setup variableTypeList + variableTypes = new String[dataTable.getVarQuantity().intValue()]; + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + int type = reader.readUShort(); + logger.fine("variable " + i + ": type=" + type); + DataVariable dv = new DataVariable(); + + dv.setInvalidRanges(new ArrayList<>()); + dv.setSummaryStatistics(new ArrayList<>()); + dv.setCategories(new ArrayList<>()); + + dv.setUnf("UNF:pending"); + dv.setFileOrder(i); + dv.setDataTable(dataTable); + + variableTypes[i] = configureVariableType(dv, type); + + variableList.add(dv); + + } + + reader.readClosingTag(TAG_VARIABLE_TYPES); + dataTable.setDataVariables(variableList); + + } + + private String configureVariableType(DataVariable dv, int type) throws IOException { + String typeLabel = null; + + if (variableTypeTable.containsKey(type)) { + typeLabel = variableTypeTable.get(type); + + dv.setTypeNumeric(); + switch (typeLabel) { + case "Byte": + case "Integer": + case "Long": + // these are treated as discrete: + dv.setIntervalDiscrete(); + break; + case "Float": + case "Double": + // these are treated as contiuous: + dv.setIntervalContinuous(); + break; + default: + throw new IOException("Unrecognized type label: " + typeLabel + " for Stata type value (short) " + type + "."); + } + + } else { + // String: + // + // 32768 - flexible length STRL; + // 1 ... 
2045 - fixed-length STRF; + + if (type == 32768) { + typeLabel = "STRL"; + hasSTRLs = true; + + } else if (type > 0 && type < 2046) { + typeLabel = "STR" + type; + } else { + throw new IOException("unknown variable type value encountered: " + type); + } + + dv.setTypeCharacter(); + dv.setIntervalDiscrete(); + } + + return typeLabel; + + } + + /* + * Variable Names are stored as number_of_variables * 33 byte long + * (zero-padded and zero-terminated) character vectors. + */ + private void readVariableNames(DataReader reader) throws IOException { + logger.fine("Variable names section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_varnames()); + reader.readOpeningTag(TAG_VARIABLE_NAMES); + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + String variableName = reader.readString(DTAVersion == 117? 33: 129); + logger.fine("variable " + i + ": name=" + variableName); + if ((variableName != null) && (!variableName.equals(""))) { + dataTable.getDataVariables().get(i).setName(variableName); + } else { + // TODO: Is this condition even possible? + // Should we be throwing an exception if it's encountered? + } + } + + reader.readClosingTag(TAG_VARIABLE_NAMES); + } + + private void readSortOrder(DataReader reader) throws IOException { + logger.fine("Sort Order section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_srtlist()); + reader.readOpeningTag(TAG_SORT_ORDER); + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + long order = reader.readULong(DTAVersion == 119? 4: 2); + logger.fine("variable " + i + ": sort order=" + order); + // We don't use this variable sort order at all. + } + + // Important! + // The SORT ORDER section (5.5 in the doc) always contains + // number_of_variables + 1 2 or 4 byte integers depending on version! + long terminatingShort = reader.readULong(DTAVersion == 119? 
4: 2); + reader.readClosingTag(TAG_SORT_ORDER); + } + + // Variable Formats are used exclusively for time and date variables. + private void readDisplayFormats(DataReader reader) throws IOException { + logger.fine("Formats section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_fmts()); + reader.readOpeningTag(TAG_DISPLAY_FORMATS); + dateVariableFormats = new String[dataTable.getVarQuantity().intValue()]; + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + String variableFormat = reader.readString(DTAVersion == 117? 49: 57); + logger.fine("variable " + i + ": displayFormat=" + variableFormat); + + String variableFormatKey; + if (variableFormat.startsWith("%t")) { + variableFormatKey = variableFormat.substring(0, 3); + } else { + variableFormatKey = variableFormat.substring(0, 2); + } + logger.fine(i + " th variableFormatKey=" + variableFormatKey); + + /* + * Now, let's check if this format is a known time or date format. + * If so, note that this changes the storage type of the variable! + * i.e., times and dates are stored as binary numeric values, but on + * the DVN side/in the tab files they will become strings. 
+ */ + if (DATE_TIME_FORMAT_TABLE.containsKey(variableFormatKey)) { + dateVariableFormats[i] = variableFormat; + dataTable.getDataVariables().get(i).setFormatCategory(DATE_TIME_FORMAT_TABLE.get(variableFormatKey)); + logger.fine(i + "th var: category=" + + DATE_TIME_FORMAT_TABLE.get(variableFormatKey)); + dataTable.getDataVariables().get(i).setTypeCharacter(); + dataTable.getDataVariables().get(i).setIntervalDiscrete(); + } + } + + reader.readClosingTag(TAG_DISPLAY_FORMATS); + } + + /* + * Another fixed-field section + */ + private void readValueLabelFormatNames(DataReader reader) throws IOException { + logger.fine("Category valuable section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_vlblnames()); + reader.readOpeningTag(TAG_VALUE_LABEL_FORMAT_NAMES); + + valueLabelsLookupTable = new String[dataTable.getVarQuantity().intValue()]; + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + String valueLabelFormat = reader.readString(DTAVersion == 117? 33: 129); + logger.fine("variable " + i + ": value label format=" + valueLabelFormat); + if ((valueLabelFormat != null) && (!valueLabelFormat.equals(""))) { + valueLabelsLookupTable[i] = valueLabelFormat; + } + } + + reader.readClosingTag(TAG_VALUE_LABEL_FORMAT_NAMES); + + } + + /* + * Another fixed-field section + */ + private void readVariableLabels(DataReader reader) throws IOException { + logger.fine("Variable labels section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_varlabs()); + reader.readOpeningTag(TAG_VARIABLE_LABELS); + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + String variableLabel = reader.readString(DTAVersion == 117? 
81: 321); + logger.fine("variable " + i + ": label=" + variableLabel); + if ((variableLabel != null) && (!variableLabel.equals(""))) { + dataTable.getDataVariables().get(i).setLabel(variableLabel); + } + } + + reader.readClosingTag(TAG_VARIABLE_LABELS); + } + + private void readCharacteristics(DataReader reader) throws IOException { + logger.fine("Characteristics section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_characteristics()); + reader.readOpeningTag(TAG_CHARACTERISTICS); + + reader.skipDefinedSections(TAG_CHARACTERISTICS_SUBSECTION); + + reader.readClosingTag(TAG_CHARACTERISTICS); + + } + + private void readData(DataReader reader) throws IOException { + logger.fine("Data section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_data()); + logger.fine("readData(): start"); + reader.readOpeningTag(TAG_DATA); + + int nvar = dataTable.getVarQuantity().intValue(); + int nobs = dataTable.getCaseQuantity().intValue(); + + int[] variableByteLengths = getVariableByteLengths(variableTypes); + int bytes_per_row = calculateBytesPerRow(variableByteLengths); + + logger.fine("data dimensions[observations x variables] = (" + nobs + "x" + nvar + ")"); + logger.fine("bytes per row=" + bytes_per_row + " bytes"); + logger.fine("variableTypes=" + Arrays.deepToString(variableTypes)); + + // create a File object to save the tab-delimited data file + File tabDelimitedDataFile = File.createTempFile("tempTabfile.", ".tab"); + + // save the temp tab-delimited file in the return ingest object: + ingesteddata.setTabDelimitedFile(tabDelimitedDataFile); + + FileOutputStream fileOutTab = new FileOutputStream(tabDelimitedDataFile); + PrintWriter pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); + + logger.fine("Beginning to read data stream."); + + for (int i = 0; i < nobs; i++) { + Object[] dataRow = new Object[nvar]; + + // TODO: + // maybe intercept any potential exceptions here, and add more + // 
diagnostic info, before re-throwing... + int byte_offset = 0; + for (int columnCounter = 0; columnCounter < nvar; columnCounter++) { + + String varType = variableTypes[columnCounter]; + + // 4.0 Check if this is a time/date variable: + boolean isDateTimeDatum = false; + String formatCategory = dataTable.getDataVariables().get(columnCounter).getFormatCategory(); + if (formatCategory != null && (formatCategory.equals("time") || formatCategory.equals("date"))) { + isDateTimeDatum = true; + } + + String variableFormat = dateVariableFormats[columnCounter]; + + if (varType == null || varType.equals("")) { + throw new IOException("Undefined variable type encountered in readData()"); + } + + if (varType.equals("Byte")) { // signed + byte byte_datum = reader.readByte(); + + logger.fine(i + "-th row " + columnCounter + + "=th column byte =" + byte_datum); + if (byte_datum >= BYTE_MISSING_VALUE) { + logger.fine(i + "-th row " + columnCounter + + "=th column byte MV=" + byte_datum); + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + } else { + dataRow[columnCounter] = byte_datum; + logger.fine(i + "-th row " + columnCounter + + "-th column byte value=" + byte_datum); + } + + byte_offset++; + } else if (varType.equals("Integer")) { // signed + short short_datum = (short) reader.readShort(); + + logger.fine(i + "-th row " + columnCounter + + "=th column stata int =" + short_datum); + + if (short_datum >= INT_MISSIG_VALUE) { + logger.fine(i + "-th row " + columnCounter + + "=th column stata long missing value=" + short_datum); + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + } else { + + if (isDateTimeDatum) { + + DecodedDateTime ddt = decodeDateTimeData("short", variableFormat, Short.toString(short_datum)); + logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); + dataRow[columnCounter] = ddt.decodedDateTime; + dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); + + } else { + 
dataRow[columnCounter] = short_datum; + logger.fine(i + "-th row " + columnCounter + + "-th column \"integer\" value=" + short_datum); + } + } + byte_offset += 2; + } else if (varType.equals("Long")) { // stata-Long = java's int: 4 byte + int int_datum = reader.readInt(); + + if (int_datum >= LONG_MISSING_VALUE) { + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + } else { + if (isDateTimeDatum) { + DecodedDateTime ddt = decodeDateTimeData("int", variableFormat, Integer.toString(int_datum)); + logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); + dataRow[columnCounter] = ddt.decodedDateTime; + dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); + + } else { + dataRow[columnCounter] = int_datum; + logger.fine(i + "-th row " + columnCounter + + "-th column \"long\" value=" + int_datum); + } + + } + byte_offset += 4; + } else if (varType.equals("Float")) { // STATA float 4-byte + + float float_datum = reader.readFloat(); + + logger.fine(i + "-th row " + columnCounter + + "=th column float =" + float_datum); + if (FLOAT_MISSING_VALUE_SET.contains(float_datum)) { + logger.fine(i + "-th row " + columnCounter + + "=th column float missing value=" + float_datum); + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + + } else { + + if (isDateTimeDatum) { + DecodedDateTime ddt = decodeDateTimeData("float", variableFormat, doubleNumberFormatter.format(float_datum)); + logger.fine(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); + dataRow[columnCounter] = ddt.decodedDateTime; + dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); + } else { + dataRow[columnCounter] = float_datum; + logger.fine(i + "-th row " + columnCounter + + "=th column float value:" + float_datum); + // This may be temporary - but for now (as in, while I'm testing + // 4.0 ingest against 3.* ingest, I need to be able to tell if a + // floating point value was a single, or 
double float in the + // original STATA file: -- L.A. Jul. 2014 + dataTable.getDataVariables().get(columnCounter).setFormat("float"); + // ? + } + + } + byte_offset += 4; + } else if (varType.equals("Double")) { // STATA double 8 bytes + + double double_datum = reader.readDouble(); + if (DOUBLE_MISSING_VALUE_SET.contains(double_datum)) { + logger.finer(i + "-th row " + columnCounter + + "=th column double missing value=" + double_datum); + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + } else { + + if (isDateTimeDatum) { + DecodedDateTime ddt = decodeDateTimeData("double", variableFormat, doubleNumberFormatter.format(double_datum)); + logger.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format); + dataRow[columnCounter] = ddt.decodedDateTime; + dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format); + } else { + logger.fine(i + "-th row " + columnCounter + + "=th column double value:" + double_datum); //doubleNumberFormatter.format(double_datum)); + + dataRow[columnCounter] = double_datum; //doubleNumberFormatter.format(double_datum); + } + + } + byte_offset += 8; + } else if (varType.matches("^STR[1-9][0-9]*")) { + // String case + int strVarLength = variableByteLengths[columnCounter]; + logger.fine(i + "-th row " + columnCounter + + "=th column is a string (" + strVarLength + " bytes)"); + // In STATA13+, STRF strings *MUST* + // be limited to ASCII. UTF8 strings can be stored as + // STRLs. + String string_datum = reader.readString(strVarLength); + if (string_datum.equals("")) { + + logger.fine(i + "-th row " + columnCounter + + "=th column string missing value=" + string_datum); + + /* Note: + * In Stata, an empty string ("") in a String vector is + * the notation for a missing value. + * So in the resulting tab file it should be stored as such, + * and not as an empty string (that would be "\"\""). 
+ * (This of course means that it's simply not possible + * to store actual empty strings in Stata) + */ + dataRow[columnCounter] = MissingValueForTabDelimitedFile; + } else { + /* + * Some special characters, like new lines and tabs need to + * be escaped - otherwise they will break our TAB file + * structure! + */ + + dataRow[columnCounter] = escapeCharacterString(string_datum); + } + byte_offset += strVarLength; + } else if (varType.equals("STRL")) { + logger.fine("STRL encountered."); + + if (cachedGSOs == null) { + cachedGSOs = new LinkedHashMap<>(); + } + + // Reading the (v,o) pair: + long v; + long o; + + if(DTAVersion == 117){ + v = reader.readUInt(); + byte_offset += 4; + o = reader.readUInt(); + byte_offset += 4; + } else { + v = reader.readUShort(); + byte_offset += 2; + o = reader.readULong(6); + byte_offset += 6; + } + // create v,o pair; save, for now: + String voPair = v + "," + o; + dataRow[columnCounter] = voPair; + + // TODO: + // would it make sense to validate v and o here? + // Making sure v <= varNum and o < numbObs; + // or, if o == numObs, v <= columnCounter; + // -- per the Stata 13+ spec... + if (!(v == columnCounter + 1 && o == i + 1)) { + if (!cachedGSOs.containsKey(voPair)) { + cachedGSOs.put(voPair, ""); + // this means we need to cache this GSO, when + // we read the STRLS section later on. 
+ } + } + + } else { + logger.warning("unknown variable type found: " + varType); + String errorMessage + = "unknown variable type encounted when reading data section: " + varType; + throw new IOException(errorMessage); + + } + } + + if (byte_offset != bytes_per_row) { + throw new IOException("Unexpected number of bytes read for data row " + i + "; " + bytes_per_row + " expected, " + byte_offset + " read."); + } + + // Dump the row of data to the tab-delimited file: + pwout.println(StringUtils.join(dataRow, "\t")); + + logger.fine("finished reading " + i + "-th row"); + + } // for (rows) + + pwout.close(); + + reader.readClosingTag(TAG_DATA); + logger.fine("NewDTA Ingest: readData(): end."); + + } + + /* + * STRLs: + */ + private void readSTRLs(DataReader reader) throws IOException { + logger.fine("STRLs section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_strls()); + + if (hasSTRLs) { + reader.readOpeningTag(TAG_STRLS); + + File intermediateTabFile = ingesteddata.getTabDelimitedFile(); + FileInputStream fileInTab = new FileInputStream(intermediateTabFile); + + Scanner scanner = new Scanner(fileInTab); + scanner.useDelimiter("\\n"); + + File finalTabFile = File.createTempFile("finalTabfile.", ".tab"); + FileOutputStream fileOutTab = new FileOutputStream(finalTabFile); + PrintWriter pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); + + logger.fine("Setting the tab-delimited file to " + finalTabFile.getName()); + ingesteddata.setTabDelimitedFile(finalTabFile); + + int nvar = dataTable.getVarQuantity().intValue(); + int nobs = dataTable.getCaseQuantity().intValue(); + + String[] line; + + for (int obsindex = 0; obsindex < nobs; obsindex++) { + if (scanner.hasNext()) { + line = (scanner.next()).split("\t", -1); + + for (int varindex = 0; varindex < nvar; varindex++) { + if ("STRL".equals(variableTypes[varindex])) { + // this is a STRL; needs to be re-processed: + + String voPair = line[varindex]; + long v; + 
long o; + if (voPair == null) { + throw new IOException("Failed to read an intermediate v,o Pair for variable " + + varindex + ", observation " + obsindex); + } + + if ("0,0".equals(voPair)) { + // This is a code for an empty string - ""; + // doesn't need to be defined or looked up. + + line[varindex] = "\"\""; + } else { + String[] voTokens = voPair.split(",", 2); + + try { + v = new Long(voTokens[0]); + o = new Long(voTokens[1]); + } catch (NumberFormatException nfex) { + throw new IOException("Illegal v,o value: " + voPair + " for variable " + + varindex + ", observation " + obsindex); + } + + if (v == varindex + 1 && o == obsindex + 1) { + // This v,o must be defined in the STRLs section: + line[varindex] = readGSO(reader, v, o); + if (line[varindex] == null) { + throw new IOException("Failed to read GSO value for " + voPair); + } + + } else { + // This one must have been cached already: + if (cachedGSOs.get(voPair) != null + && !cachedGSOs.get(voPair).equals("")) { + line[varindex] = cachedGSOs.get(voPair); + } else { + throw new IOException("GSO string unavailable for v,o value " + voPair); + } + } + } + } + } + // Dump the row of data to the tab-delimited file: + pwout.println(StringUtils.join(line, "\t")); + } + } + + scanner.close(); + pwout.close(); + + reader.readClosingTag(TAG_STRLS); + } else { + // If this data file doesn't use STRLs, we can just skip + // this section, and assume that we are done with the + // tabular data file. + reader.readPrimitiveSection(TAG_STRLS); + } + + //reader.readClosingTag(TAG_STRLS); + } + + private String readGSO(DataReader reader, long v, long o) throws IOException { + if (!reader.checkTag(STRL_GSO_HEAD)) { + return null; + } + + // Skipping the GSO header - fixed string "GSO": + reader.readBytes(STRL_GSO_HEAD.length()); + + // Reading the stored (v,o) pair: + long vStored = reader.readUInt(); + long oStored = reader.readULong(DTAVersion == 117? 
4: 8); + + String voPair = v + "," + o; + + if (vStored != v || oStored != o) { + throw new IOException("GSO reading mismatch: expected v,o pair: " + + voPair + ", found: " + vStored + "," + oStored); + } + + short type = reader.readUByte(); + boolean binary = false; + + if (type == 129) { + logger.fine("STRL TYPE: binary"); + binary = true; + } else if (type == 130) { + logger.fine("STRL TYPE: ascii"); + } else { + logger.warning("WARNING: unknown STRL type: " + type); + } + + long length = reader.readUInt(); + + logger.fine("Advertised length of the STRL: " + length); + + // TODO: + // length can technically be 0 < length < 2^^32; + // but Java arrays are only [int], i.e., can only have < 2^^31 + // elements; readBytes() allocates and returns a byte[] array. + // so I should probably check the value of length - if it + // can fit into a signed int; not that it's likely to happen + // in real life. Still, should we throw an exception here, if + // this length is > 2^^31? + byte[] contents = reader.readBytes((int) length); + + String gsoString; + if (binary) { + gsoString = new String(contents, "utf8"); + } else { + gsoString = new String(contents, 0, (int) length - 1, "US-ASCII"); + } + + logger.fine("GSO " + v + "," + o + ": " + gsoString); + + String escapedGsoString = escapeCharacterString(gsoString); + + if (cachedGSOs.containsKey(voPair)) { + // We need to cache this GSO: + if (!"".equals(cachedGSOs.get(voPair))) { + throw new IOException("Multiple GSO definitions for v,o " + voPair); + } + cachedGSOs.put(voPair, escapedGsoString); + } + + return escapedGsoString; + } + + private void readValueLabels(DataReader reader) throws IOException { + logger.fine("Value Labels section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_vallabs()); + logger.fine("readValueLabels(): start."); + + reader.readOpeningTag(TAG_VALUE_LABELS); + + while (reader.checkTag("<" + TAG_VALUE_LABELS_LBL_DEF + ">")) { + 
reader.readOpeningTag(TAG_VALUE_LABELS_LBL_DEF); + long label_table_length = reader.readUInt(); + + String label_table_name = reader.readString(DTAVersion == 117? 33: 129); + + reader.readBytes(3); + + // read the value_label_table that follows. + // should be label_table_length. + int number_of_categories = (int) reader.readUInt(); + long text_length = reader.readUInt(); + + long value_category_offset = 8; + + long[] value_label_offsets = new long[number_of_categories]; + long[] value_label_offsets_sorted = null; + long[] category_values = new long[number_of_categories]; + String[] category_value_labels = new String[number_of_categories]; + + boolean alreadySorted = true; + + for (int i = 0; i < number_of_categories; i++) { + value_label_offsets[i] = reader.readUInt(); + logger.fine("offset " + i + ": " + value_label_offsets[i]); + value_category_offset += 4; + if (i > 0 && value_label_offsets[i] < value_label_offsets[i-1]) { + alreadySorted = false; + } + } + + if (!alreadySorted) { + //value_label_offsets_sorted = new long[number_of_categories]; + value_label_offsets_sorted = Arrays.copyOf(value_label_offsets, number_of_categories); + Arrays.sort(value_label_offsets_sorted); + } + + for (int i = 0; i < number_of_categories; i++) { + category_values[i] = reader.readInt(); + value_category_offset += 4; + } + + int total_label_bytes = 0; + + long label_offset; + long label_end; + int label_length; + + // Read the remaining bytes in this section. + // This byte[] array will contain all the value labels for the + // variable. Each is terminated by the binary zero byte; so we + // can read the bytes for each label at the defined offset until + // we encounter \000. Or we can rely on the (sorted) list of offsets + // to determine where each label ends (implemented below). 
+ byte[] labelBytes = null; + if((int)text_length != 0) { //If length is 0 we don't need to read any bytes + labelBytes = reader.readBytes((int)text_length); + } + + for (int i = 0; i < number_of_categories; i++) { + label_offset = value_label_offsets[i]; + + if (value_label_offsets_sorted == null) { + label_end = i < number_of_categories - 1 ? value_label_offsets[i + 1] : text_length; + } else { + int sortedPos = Arrays.binarySearch(value_label_offsets_sorted, label_offset); + label_end = sortedPos < number_of_categories - 1 ? value_label_offsets_sorted[sortedPos + 1] : text_length; + } + label_length = (int)(label_end - label_offset); + + category_value_labels[i] = new String(Arrays.copyOfRange(labelBytes, (int)label_offset, (int)label_end-1), "US-ASCII"); + total_label_bytes += label_length; + } + + value_category_offset += total_label_bytes; + + logger.fine("text_length: " + text_length); + logger.fine("total_label_bytes: " + total_label_bytes); + if (total_label_bytes != text_length) { + throw new IOException(""); + } + + if (value_category_offset != label_table_length) { + throw new IOException(""); + } + reader.readClosingTag(TAG_VALUE_LABELS_LBL_DEF); + + // Find the variables that may be linking to this Category Values Table + // and create VariableCategory objects for the corresponding + // DataVariables: + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + if (label_table_name.equals(valueLabelsLookupTable[i])) { + logger.fine("cross-linking value label table for " + label_table_name); + + for (int j = 0; j < number_of_categories; j++) { + VariableCategory cat = new VariableCategory(); + + long cat_value = category_values[j]; + String cat_label = category_value_labels[j]; + + cat.setValue("" + cat_value); + cat.setLabel(cat_label); + + /* cross-link the variable and category to each other: */ + cat.setDataVariable(dataTable.getDataVariables().get(i)); + dataTable.getDataVariables().get(i).getCategories().add(cat); + } + } + } + } + + 
reader.readClosingTag(TAG_VALUE_LABELS); + logger.fine("readValueLabels(): end."); + + } + + /* + * Helper methods for decoding data: + */ + private int calculateBytesPerRow(int[] variableByteLengths) throws IOException { + if (variableByteLengths == null || variableByteLengths.length != dataTable.getVarQuantity()) { + throw new IOException(""); + } + int bytes_per_row = 0; + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + if (variableByteLengths[i] < 1) { + throw new IOException(""); + } + bytes_per_row += variableByteLengths[i]; + } + + return bytes_per_row; + } + + private int[] getVariableByteLengths(String[] variableTypes) throws IOException { + if (variableTypes == null || variableTypes.length != dataTable.getVarQuantity()) { + throw new IOException(""); + } + + int[] variableByteLengths = new int[dataTable.getVarQuantity().intValue()]; + + for (int i = 0; i < dataTable.getVarQuantity(); i++) { + variableByteLengths[i] = getVariableByteLength(variableTypes[i]); + } + + return variableByteLengths; + } + + private int getVariableByteLength(String variableType) throws IOException { + int byte_length = 0; + + if (variableType == null || variableType.equals("")) { + throw new IOException(""); + } + if (byteLengthTable.containsKey(variableType)) { + return byteLengthTable.get(variableType); + } + + if (variableType.matches("^STR[1-9][0-9]*")) { + String stringLengthToken = variableType.substring(3); + Integer stringLength; + try { + stringLength = new Integer(stringLengthToken); + } catch (NumberFormatException nfe) { + stringLength = null; + } + if (stringLength == null || stringLength < 1 || stringLength > 2045) { + throw new IOException("Invalid STRF encountered: " + variableType); + } + return stringLength; + } + + throw new IOException("Unknown/invalid variable type: " + variableType); + } + + private class DecodedDateTime { + + String format; + String decodedDateTime; + } + + private DecodedDateTime decodeDateTimeData(String storageType, String 
FormatType, String rawDatum) throws IOException { + + logger.fine("(storageType, FormatType, rawDatum)=(" + + storageType + ", " + FormatType + ", " + rawDatum + ")"); + /* + * Historical note: + pseudofunctions, td(), tw(), tm(), tq(), and th() + used to be called d(), w(), m(), q(), and h(). + Those names still work but are considered anachronisms. + + */ + + long milliSeconds; + String decodedDateTime; + String format; + + if (FormatType.matches("^%tc.*")) { + // tc is a relatively new format + // datum is millisecond-wise + milliSeconds = Math.round(new Double(rawDatum)) + STATA_BIAS_TO_EPOCH; + decodedDateTime = sdf_ymdhmsS.format(new Date(milliSeconds)); + format = sdf_ymdhmsS.toPattern(); + logger.fine("tc: result=" + decodedDateTime + ", format = " + format); + + } else if (FormatType.matches("^%t?d.*")) { + milliSeconds = Math.round(new Double(rawDatum)) * MILLISECCONDS_PER_DAY + STATA_BIAS_TO_EPOCH; + logger.fine("milliSeconds=" + milliSeconds); + + decodedDateTime = sdf_ymd.format(new Date(milliSeconds)); + format = sdf_ymd.toPattern(); + logger.fine("td:" + decodedDateTime + ", format = " + format); + + } else if (FormatType.matches("^%t?w.*")) { + + long weekYears = Math.round(new Double(rawDatum)); + long left = Math.abs(weekYears) % 52L; + long years; + if (weekYears < 0L) { + left = 52L - left; + if (left == 52L) { + left = 0L; + } + //out.println("left="+left); + years = (Math.abs(weekYears) - 1) / 52L + 1L; + years *= -1L; + } else { + years = weekYears / 52L; + } + + String yearString = Long.toString(1960L + years); + String dayInYearString = new DecimalFormat("000").format((left * 7) + 1); + String yearDayInYearString = yearString + "-" + dayInYearString; + + Date tempDate = null; + try { + tempDate = new SimpleDateFormat("yyyy-DDD").parse(yearDayInYearString); + } catch (ParseException ex) { + throw new IOException(ex); + } + + decodedDateTime = sdf_ymd.format(tempDate.getTime()); + format = sdf_ymd.toPattern(); + + } else if 
(FormatType.matches("^%t?m.*")) { + // month + long monthYears = Math.round(new Double(rawDatum)); + long left = Math.abs(monthYears) % 12L; + long years; + if (monthYears < 0L) { + left = 12L - left; + //out.println("left="+left); + years = (Math.abs(monthYears) - 1) / 12L + 1L; + years *= -1L; + } else { + years = monthYears / 12L; + } + + if (left == 12L) { + left = 0L; + } + Long monthdata = (left + 1); + String month = "-" + twoDigitFormatter.format(monthdata) + "-01"; + long year = 1960L + years; + String monthYear = year + month; + logger.fine("rawDatum=" + rawDatum + ": monthYear=" + monthYear); + + decodedDateTime = monthYear; + format = "yyyy-MM-dd"; + logger.fine("tm:" + decodedDateTime + ", format:" + format); + + } else if (FormatType.matches("^%t?q.*")) { + // quarter + long quarterYears = Math.round(new Double(rawDatum)); + long left = Math.abs(quarterYears) % 4L; + long years; + if (quarterYears < 0L) { + left = 4L - left; + //out.println("left="+left); + years = (Math.abs(quarterYears) - 1) / 4L + 1L; + years *= -1L; + } else { + years = quarterYears / 4L; + } + + String quarter = null; + + if ((left == 0L) || (left == 4L)) { + //quarter ="q1"; // + quarter = "-01-01"; + } else if (left == 1L) { + //quarter = "q2"; // + quarter = "-04-01"; + } else if (left == 2L) { + //quarter = "q3"; // + quarter = "-07-01"; + } else if (left == 3L) { + //quarter = "q4"; // + quarter = "-11-01"; + } + + long year = 1960L + years; + String quarterYear = Long.toString(year) + quarter; + logger.fine("rawDatum=" + rawDatum + ": quarterYear=" + quarterYear); + + decodedDateTime = quarterYear; + format = "yyyy-MM-dd"; + logger.fine("tq:" + decodedDateTime + ", format:" + format); + + } else if (FormatType.matches("^%t?h.*")) { + // half year + // odd number:2nd half + // even number: 1st half + + long halvesYears = Math.round(new Double(rawDatum)); + long left = Math.abs(halvesYears) % 2L; + long years; + if (halvesYears < 0L) { + years = (Math.abs(halvesYears) - 1) / 
2L + 1L; + years *= -1L; + } else { + years = halvesYears / 2L; + } + + String half; + if (left != 0L) { + // odd number => 2nd half: "h2" + //half ="h2"; // + half = "-07-01"; + } else { + // even number => 1st half: "h1" + //half = "h1"; // + half = "-01-01"; + } + long year = 1960L + years; + String halfYear = Long.toString(year) + half; + logger.fine("rawDatum=" + rawDatum + ": halfYear=" + halfYear); + + decodedDateTime = halfYear; + format = "yyyy-MM-dd"; + logger.fine("th:" + decodedDateTime + ", format:" + format); + + } else if (FormatType.matches("^%t?y.*")) { + // year type's origin is 0 AD + decodedDateTime = rawDatum; + format = "yyyy"; + logger.fine("th:" + decodedDateTime); + } else { + decodedDateTime = rawDatum; + format = null; + } + DecodedDateTime retValue = new DecodedDateTime(); + retValue.decodedDateTime = decodedDateTime; + retValue.format = format; + return retValue; + } + + private class DTADataMap { + + private long dta_offset_stata_data = 0; + private long dta_offset_map = 0; + private long dta_offset_variable_types = 0; + private long dta_offset_varnames = 0; + private long dta_offset_sortlist = 0; + private long dta_offset_formats = 0; + private long dta_offset_value_label_names = 0; + private long dta_offset_variable_labels = 0; + private long dta_offset_characteristics = 0; + private long dta_offset_data = 0; + private long dta_offset_strls = 0; + private long dta_offset_value_labels = 0; + private long dta_offset_data_close = 0; + private long dta_offset_eof = 0; + + // getters: + public long getOffset_head() { + return dta_offset_stata_data; + } + + public long getOffset_map() { + return dta_offset_map; + } + + public long getOffset_types() { + return dta_offset_variable_types; + } + + public long getOffset_varnames() { + return dta_offset_varnames; + } + + public long getOffset_srtlist() { + return dta_offset_sortlist; + } + + public long getOffset_fmts() { + return dta_offset_formats; + } + + public long getOffset_vlblnames() { + 
return dta_offset_value_label_names; + } + + public long getOffset_varlabs() { + return dta_offset_variable_labels; + } + + public long getOffset_characteristics() { + return dta_offset_characteristics; + } + + public long getOffset_data() { + return dta_offset_data; + } + + public long getOffset_strls() { + return dta_offset_strls; + } + + public long getOffset_vallabs() { + return dta_offset_value_labels; + } + + public long getOffset_data_close() { + return dta_offset_data_close; + } + + public long getOffset_eof() { + return dta_offset_eof; + } + + // setters: + public void setOffset_head(long dta_offset_stata_data) { + this.dta_offset_stata_data = dta_offset_stata_data; + } + + public void setOffset_map(long dta_offset_map) { + this.dta_offset_map = dta_offset_map; + } + + public void setOffset_types(long dta_offset_variable_types) { + this.dta_offset_variable_types = dta_offset_variable_types; + } + + public void setOffset_varnames(long dta_offset_varnames) { + this.dta_offset_varnames = dta_offset_varnames; + } + + public void setOffset_srtlist(long dta_offset_sortlist) { + this.dta_offset_sortlist = dta_offset_sortlist; + } + + public void setOffset_fmts(long dta_offset_formats) { + this.dta_offset_formats = dta_offset_formats; + } + + public void setOffset_vlblnames(long dta_offset_value_label_names) { + this.dta_offset_value_label_names = dta_offset_value_label_names; + } + + public void setOffset_varlabs(long dta_offset_variable_labels) { + this.dta_offset_variable_labels = dta_offset_variable_labels; + } + + public void setOffset_characteristics(long dta_offset_characteristics) { + this.dta_offset_characteristics = dta_offset_characteristics; + } + + public void setOffset_data(long dta_offset_data) { + this.dta_offset_data = dta_offset_data; + } + + public void setOffset_strls(long dta_offset_strls) { + this.dta_offset_strls = dta_offset_strls; + } + + public void setOffset_vallabs(long dta_offset_value_labels) { + this.dta_offset_value_labels = 
dta_offset_value_labels; + } + + public void setOffset_data_close(long dta_offset_data_close) { + this.dta_offset_data_close = dta_offset_data_close; + } + + public void setOffset_eof(long dta_offset_eof) { + this.dta_offset_eof = dta_offset_eof; + } + } +} diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 83beb06af55..73e72df1cf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -110,6 +110,8 @@ public class FileUtil implements java.io.Serializable { public static final String MIME_TYPE_STATA = "application/x-stata"; public static final String MIME_TYPE_STATA13 = "application/x-stata-13"; + public static final String MIME_TYPE_STATA14 = "application/x-stata-14"; + public static final String MIME_TYPE_STATA15 = "application/x-stata-15"; public static final String MIME_TYPE_RDATA = "application/x-rlang-transport"; public static final String MIME_TYPE_CSV = "text/csv"; @@ -1089,6 +1091,10 @@ public static boolean ingestableAsTabular(String mimeType) { return true; } else if (mimeType.equals(MIME_TYPE_STATA13)) { return true; + } else if (mimeType.equals(MIME_TYPE_STATA14)) { + return true; + } else if (mimeType.equals(MIME_TYPE_STATA15)) { + return true; } else if (mimeType.equals(MIME_TYPE_RDATA)) { return true; } else if (mimeType.equals(MIME_TYPE_CSV) || mimeType.equals(MIME_TYPE_CSV_ALT)) { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java b/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java index 0fa5881b35d..33fe7dae383 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java @@ -3,12 +3,15 @@ import com.jayway.restassured.RestAssured; import com.jayway.restassured.path.json.JsonPath; import com.jayway.restassured.response.Response; +import java.io.File; +import java.util.Arrays; 
import java.util.logging.Logger; import static javax.ws.rs.core.Response.Status.CREATED; -import static javax.ws.rs.core.Response.Status.NOT_FOUND; import static javax.ws.rs.core.Response.Status.OK; import static org.hamcrest.CoreMatchers.equalTo; +import static org.junit.Assert.assertEquals; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; public class TabularIT { @@ -20,6 +23,7 @@ public static void setUpClass() { RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); } + @Ignore @Test public void testTabularFile() throws InterruptedException { Response createUser = UtilIT.createRandomUser(); @@ -49,52 +53,172 @@ public void testTabularFile() throws InterruptedException { .statusCode(OK.getStatusCode()); long fileId = JsonPath.from(uploadIngestableFile.body().asString()).getLong("data.files[0].dataFile.id"); String fileIdAsString = Long.toString(fileId); - String filePersistentId = JsonPath.from(uploadIngestableFile.body().asString()).getString("data.files[0].dataFile.persistentId"); +// String filePersistentId = JsonPath.from(uploadIngestableFile.body().asString()).getString("data.files[0].dataFile.persistentId"); System.out.println("fileId: " + fileId); - System.out.println("filePersistentId: " + filePersistentId); +// System.out.println("filePersistentId: " + filePersistentId); // Give file time to ingest - Thread.sleep(3000); + Thread.sleep(10000); - Response getMetaUsingPersistentId = UtilIT.getMetaDatafileDeprecated(filePersistentId, apiToken); - getMetaUsingPersistentId.then().assertThat() - .statusCode(NOT_FOUND.getStatusCode()); - - Response getMetaUsingId = UtilIT.getMetaDatafileDeprecated(fileIdAsString, apiToken); - getMetaUsingId.prettyPrint(); - getMetaUsingId.then().assertThat() - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .statusCode(OK.getStatusCode()); - - Response fileMetadataNoFormat = UtilIT.getFileMetadata(filePersistentId, null, apiToken); + Response fileMetadataNoFormat = 
UtilIT.getFileMetadata(fileIdAsString, null, apiToken); fileMetadataNoFormat.prettyPrint(); fileMetadataNoFormat.then().assertThat() - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .statusCode(OK.getStatusCode()); + .statusCode(OK.getStatusCode()) + .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); Response fileMetadataNoFormatFileId = UtilIT.getFileMetadata(fileIdAsString, null, apiToken); fileMetadataNoFormatFileId.prettyPrint(); fileMetadataNoFormatFileId.then().assertThat() - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .statusCode(OK.getStatusCode()); + .statusCode(OK.getStatusCode()) + .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); - Response fileMetadataDdi = UtilIT.getFileMetadata(filePersistentId, "ddi", apiToken); + Response fileMetadataDdi = UtilIT.getFileMetadata(fileIdAsString, "ddi", apiToken); fileMetadataDdi.prettyPrint(); fileMetadataDdi.then().assertThat() + .statusCode(OK.getStatusCode()) .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .statusCode(OK.getStatusCode()); + .body("codeBook.dataDscr.var[0].@name", equalTo("var1")) + // Yes, it's odd that we go from "var1" to "var3" to "var2" to "var5" + .body("codeBook.dataDscr.var[1].@name", equalTo("var3")) + .body("codeBook.dataDscr.var[2].@name", equalTo("var2")) + .body("codeBook.dataDscr.var[3].@name", equalTo("var5")); boolean testPreprocessedMetadataFormat = false; if (testPreprocessedMetadataFormat) { // If you don't have all the dependencies in place, such as Rserve, you might get a 503 and this error: // org.rosuda.REngine.Rserve.RserveException: Cannot connect: Connection refused - Response fileMetadataPreProcessed = UtilIT.getFileMetadata(filePersistentId, "preprocessed", apiToken); + Response fileMetadataPreProcessed = UtilIT.getFileMetadata(fileIdAsString, "preprocessed", apiToken); fileMetadataPreProcessed.prettyPrint(); fileMetadataPreProcessed.then().assertThat() - 
.body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .statusCode(OK.getStatusCode()); + .statusCode(OK.getStatusCode()) + .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); } } + @Ignore + @Test + public void test50by1000() { + // cp scripts/search/data/tabular/50by1000.dta /tmp + String fileName = "/tmp/50by1000.dta"; + String fileType = "application/x-stata"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 50", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata13TinyFile() { + // cp scripts/search/data/tabular/120745.dta /tmp + String fileName = "/tmp/120745.dta"; + String fileType = "application/x-stata"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 1", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata13Auto() { + // curl https://www.stata-press.com/data/r13/auto.dta > /tmp/stata13-auto.dta + String fileName = "/tmp/stata13-auto.dta"; + String fileType = "application/x-stata-13"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 12", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata14OpenSourceAtHarvard() { + // https://dataverse.harvard.edu/file.xhtml?fileId=3040230 converted to Stata 14: 2017-07-31.tab + // cp scripts/search/data/tabular/open-source-at-harvard118.dta /tmp + String fileName = "/tmp/open-source-at-harvard118.dta"; + // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml + String fileType = "application/x-stata-14"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 10", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata14Aggregated() { + // 
https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta + String fileName = "/tmp/2018_04_06_Aggregated_dataset_v2.dta"; + // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml + String fileType = "application/x-stata-14"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 227", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata14MmPublic() { + // TODO: This file was downloaded at random. We could keep trying to get it to ingest. + // https://dataverse.harvard.edu/file.xhtml?fileId=2775556 Stata 14: mm_public_120615_v14.dta + // For this file "hasSTRLs" is true so it might be nice to get it working. + String fileName = "/tmp/mm_public_120615_v14.dta"; + // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml + String fileType = "application/x-stata-14"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + // We don't know how many variables it has. Probably not 12. + assertEquals("NVARS: 12", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata15() { + // for i in `echo {0..33000}`; do echo -n "var$i,"; done > 33k.csv + // Then open Stata 15, run `set maxvar 40000` and import. 
+ String fileName = "/tmp/33k.dta"; + String fileType = "application/x-stata-15"; + Response response = UtilIT.testIngest(fileName, fileType); + response.prettyPrint(); + assertEquals("NVARS: 33001", response.body().asString().split("\n")[0]); + } + + @Ignore + @Test + public void testStata13Multiple() { + String fileType = "application/x-stata-13"; + // From /usr/local/dvn-admin/stata on dvn-build + String stata13directory = "/tmp/stata-13"; + File folder = new File(stata13directory); + File[] listOfFiles = folder.listFiles(); + for (int i = 0; i < listOfFiles.length; i++) { + File file = listOfFiles[i]; + String filename = file.getName(); + String filenameFullPath = file.getAbsolutePath(); + Response response = UtilIT.testIngest(filenameFullPath, fileType); + String firstLine = response.body().asString().split("\n")[0]; + String[] parts = firstLine.split(":"); + String[] justErrors = Arrays.copyOfRange(parts, 1, parts.length); + System.out.println(i + "\t" + filename + "\t" + Arrays.toString(justErrors) + "\t" + firstLine); + } + } + + @Ignore + @Test + public void testStata14Multiple() { + String fileType = "application/x-stata-14"; + // From /usr/local/dvn-admin/stata on dvn-build + String stata13directory = "/tmp/stata-14"; + File folder = new File(stata13directory); + File[] listOfFiles = folder.listFiles(); + for (int i = 0; i < listOfFiles.length; i++) { + File file = listOfFiles[i]; + String filename = file.getName(); + String filenameFullPath = file.getAbsolutePath(); + Response response = UtilIT.testIngest(filenameFullPath, fileType); + String firstLine = response.body().asString().split("\n")[0]; + String[] parts = firstLine.split(":"); + String[] justErrors = Arrays.copyOfRange(parts, 1, parts.length); + System.out.println(i + "\t" + filename + "\t" + Arrays.toString(justErrors) + "\t" + firstLine); + } + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index 
2f87477a162..2fd619cfda5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -533,15 +533,9 @@ static Response getFileMetadata(String fileIdOrPersistentId, String optionalForm .get("/api/access/datafile/" + idInPath + "/metadata" + optionalFormatInPath + "?key=" + apiToken + optionalQueryParam); } - static Response getMetaDatafileDeprecated(String fileIdOrPersistentId, String apiToken) { - String idInPath = fileIdOrPersistentId; // Assume it's a number. - String optionalQueryParam = ""; // If idOrPersistentId is a number we'll just put it in the path. - if (!NumberUtils.isNumber(fileIdOrPersistentId)) { - idInPath = ":persistentId"; - optionalQueryParam = "&persistentId=" + fileIdOrPersistentId; - } + static Response testIngest(String fileName, String fileType) { return given() - .get("/api/meta/datafile/" + idInPath + "?key=" + apiToken + optionalQueryParam); + .get("/api/ingest/test/file?fileName=" + fileName + "&fileType=" + fileType); } static Response getSwordAtomEntry(String persistentId, String apiToken) { diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java new file mode 100644 index 00000000000..2f8908c5920 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReaderTest.java @@ -0,0 +1,24 @@ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; + +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import org.junit.Test; +import static org.junit.Assert.assertEquals; + +public class DTAFileReaderTest { + + DTAFileReader instance = new DTAFileReader(null); + File nullDataFile = null; + + @Test + public void testOs() throws 
IOException { + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/50by1000.dta"))), nullDataFile); + assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); + assertEquals("rel_8_or_9", result.getDataTable().getOriginalFormatVersion()); + assertEquals(50, result.getDataTable().getDataVariables().size()); + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReaderTest.java new file mode 100644 index 00000000000..8ac84d9693a --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DataReaderTest.java @@ -0,0 +1,50 @@ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.junit.Test; +import static org.junit.Assert.*; + +/** + * @author oscardssmith + */ +public class DataReaderTest { + @Test + public void testReadInt() throws IOException { + byte[] bytes = ByteBuffer.allocate(4).putInt(-1).array(); + BufferedInputStream stream = new BufferedInputStream(new ByteArrayInputStream(bytes)); + DataReader reader = new DataReader(stream); + reader.setLSF(true); + assertEquals(-1, reader.readInt()); + } + + @Test + public void testReadUInt() throws IOException { + byte[] bytes = ByteBuffer.allocate(4).putInt(-1).array(); + BufferedInputStream stream = new BufferedInputStream(new ByteArrayInputStream(bytes)); + DataReader reader = new DataReader(stream); + reader.setLSF(true); + assertEquals(4294967295L, reader.readUInt()); + } + + @Test + public void testReadUShort() throws IOException { + byte[] bytes = ByteBuffer.allocate(2).putShort((short) -1).array(); + BufferedInputStream stream = new BufferedInputStream(new 
ByteArrayInputStream(bytes)); + DataReader reader = new DataReader(stream); + reader.setLSF(true); + assertEquals(65535, reader.readUShort()); + } + + // This should throw until we figure out what to do with uLongs that are large + @Test(expected = IOException.class) + public void testReadULong() throws IOException { + byte[] bytes = {-1,-1,-1,-1,-1,-1,-1,-1,}; + BufferedInputStream stream = new BufferedInputStream(new ByteArrayInputStream(bytes)); + DataReader reader = new DataReader(stream); + reader.setLSF(true); + assertEquals(-1, reader.readULong()); + } +} diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java new file mode 100644 index 00000000000..3c8c0a0d224 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/NewDTAFileReaderTest.java @@ -0,0 +1,147 @@ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta; + +import edu.harvard.iq.dataverse.DataTable; +import edu.harvard.iq.dataverse.datavariable.DataVariable; +import edu.harvard.iq.dataverse.datavariable.VariableCategory; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DataReader; +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import org.apache.commons.io.FileUtils; +import org.junit.Test; +import static org.junit.Assert.*; +import org.junit.Ignore; +import org.junit.Assert; + +public class NewDTAFileReaderTest { + NewDTAFileReader instance; + File nullDataFile = null; + private final String base = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/"; + + @Test + public void testAuto() throws IOException { + 
instance = new NewDTAFileReader(null, 117); + // From https://www.stata-press.com/data/r13/auto.dta + // `strings` shows "
117" + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("scripts/search/data/tabular/stata13-auto.dta"))), nullDataFile); + assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); + assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); + assertEquals(12, result.getDataTable().getDataVariables().size()); + DataVariable foreign = result.getDataTable().getDataVariables().get(11); + assertEquals(2, foreign.getCategories().size()); + List origins = (List) foreign.getCategories(); + assertEquals("Domestic", origins.get(0).getLabel()); + assertEquals("Foreign", origins.get(1).getLabel()); + } + + @Test + public void testStrl() throws IOException { + instance = new NewDTAFileReader(null, 118); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "strl.dta"))), nullDataFile); + DataTable table = result.getDataTable(); + assertEquals("application/x-stata", table.getOriginalFileFormat()); + assertEquals("STATA 14", table.getOriginalFormatVersion()); + assertEquals(7, table.getDataVariables().size()); + assertEquals(3, (long)table.getCaseQuantity()); + + String[] vars = {"make","price","mpg","rep78","trunk","gear_ratio","strls"}; + String[] actualVars = table.getDataVariables().stream().map((var) -> var.getName()).toArray(String[]::new); + Assert.assertArrayEquals(vars, actualVars); + String expected = "\"Buick LeSabre\" 5788 1.1111111111111111E21 100 32767 2.73 \"a\"\n" + + "\"Buick Opel\" 4453 26.0 10 2.87 \"bb\"\n" + + "\"Buick Regal\" 5189 20.0 3 16 2.93 \"ccc\"\n"; + assertEquals(expected, FileUtils.readFileToString(result.getTabDelimitedFile())); + } + + @Test + public void testDates() throws IOException { + instance = new NewDTAFileReader(null, 118); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File(base + "dates.dta"))), nullDataFile); + DataTable table = result.getDataTable(); + 
assertEquals("application/x-stata", table.getOriginalFileFormat()); + assertEquals("STATA 14", table.getOriginalFormatVersion()); + assertEquals(7, table.getDataVariables().size()); + assertEquals(4, (long)table.getCaseQuantity()); + String[] vars = {"Clock","Daily","Weekly","Monthly","Quarterly","BiAnnually","Annually"}; + String[] actualVars = table.getDataVariables().stream().map((var) -> var.getName()).toArray(String[]::new); + Assert.assertArrayEquals(vars, actualVars); + String expected = "2595-09-27 06:58:52.032 2018-06-20 2018-11-05 2018-06-01 2018-01-01 2018-01-01 2018\n" + + "2595-09-27 06:58:52.032 2018-06-20 2018-11-05 2018-06-01 2018-04-01 2018-01-01 2018\n" + + "2595-09-27 06:58:52.032 2018-06-20 2018-11-05 2018-06-01 2018-07-01 2018-07-01 2018\n" + + "2595-09-27 06:58:52.032 2018-06-20 2018-11-05 2018-06-01 2018-11-01 2018-07-01 2018\n"; + assertEquals(expected, FileUtils.readFileToString(result.getTabDelimitedFile())); + } + + @Test(expected = IOException.class) + public void testNull() throws IOException { + instance = new NewDTAFileReader(null, 117); + TabularDataIngest result = instance.read(null, new File("")); + } + + // TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue? 
+ @Ignore + @Test + public void testFirstCategoryNonZeroOffset() throws IOException { + instance = new NewDTAFileReader(null, 117); + + // https://dataverse.harvard.edu/file.xhtml?fileId=2865667 Stata 13 HouseImputingCivilRightsInfo.dta md5=7dd144f27cdb9f8d1c3f4eb9c4744c42 + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/HouseImputingCivilRightsInfo.dta"))), nullDataFile); + assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); + assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); + assertEquals(5, result.getDataTable().getDataVariables().size()); + DataVariable imputing = result.getDataTable().getDataVariables().get(4); + assertEquals("imputingincludes10perofmembers", imputing.getName()); + assertEquals("Dummy Variable: 1 = More than 10% of votes cast were imputed; 0 = Less than 10%", imputing.getLabel()); + assertEquals(2, imputing.getCategories().size()); + List origins = (List) imputing.getCategories(); + // Given the MD5 above, we expect the categories to come out in the order below. + assertEquals("Fewer than 10% Imputed", origins.get(0).getLabel()); + assertEquals("More than 10% Imputed", origins.get(1).getLabel()); + } + + // TODO: Can we create a small file to check into the code base that exercises the value-label names non-zero offset issue? 
+ @Ignore + @Test + public void testFirstCategoryNonZeroOffset1() throws IOException { + instance = new NewDTAFileReader(null, 118); + // https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/2018_04_06_Aggregated_dataset_v2.dta"))), nullDataFile); + assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); + assertEquals("STATA 14", result.getDataTable().getOriginalFormatVersion()); + assertEquals(227, result.getDataTable().getDataVariables().size()); + DataVariable q10 = result.getDataTable().getDataVariables().get(25); + assertEquals("Q10", q10.getName()); + assertEquals("Matching party leaders pics", q10.getLabel()); + assertEquals(2, q10.getCategories().size()); + List matching = (List) q10.getCategories(); + // Given the MD5 above, we expect the categories to come out in the order below. + assertEquals("None matched", matching.get(0).getLabel()); + assertEquals("All matched", matching.get(1).getLabel()); + } + + // TODO: Is there a way to exersise this code with a smaller file? 33k.dta is 21MB. + @Ignore + @Test + public void test33k() throws IOException { + instance = new NewDTAFileReader(null, 119); + // for i in `echo {0..33000}`; do echo -n "var$i,"; done > 33k.csv + // Then open Stata 15, run `set maxvar 40000` and import. + } + + // TODO: Can we create a small file to check into the code base that exercises the characteristics issue? + // FIXME: testCharacteristics is passing in DTA117FileReaderTest but not here. 
+ @Ignore + @Test + public void testCharacteristics() throws IOException { + instance = new NewDTAFileReader(null, 117); + TabularDataIngest result = instance.read(new BufferedInputStream(new FileInputStream(new File("/tmp/15aa6802ee5-5d2ed1bf55a5.dta"))), nullDataFile); + assertEquals("application/x-stata", result.getDataTable().getOriginalFileFormat()); + assertEquals("STATA 13", result.getDataTable().getOriginalFormatVersion()); + assertEquals(441, result.getDataTable().getDataVariables().size()); + } +} diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/dates.dta b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/dates.dta new file mode 100644 index 00000000000..08bcae02cda Binary files /dev/null and b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/dates.dta differ diff --git a/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/strl.dta b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/strl.dta new file mode 100644 index 00000000000..2d267fa7d89 Binary files /dev/null and b/src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/strl.dta differ