Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow different data types per curve in data section reader #461

Merged
merged 6 commits into from
Apr 26, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 42 additions & 58 deletions lasio/las.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
import re
import sys
import traceback

# get basestring in py3

Expand Down Expand Up @@ -89,6 +90,7 @@ def read(
ignore_data_comments="#",
mnemonic_case="upper",
index_unit=None,
dtypes="auto",
**kwargs
):
"""Read a LAS file.
Expand All @@ -112,6 +114,13 @@ def read(
'upper': convert all HeaderItem mnemonics to uppercase
'lower': convert all HeaderItem mnemonics to lowercase
index_unit (str): Optionally force-set the index curve's unit to "m" or "ft"
dtypes ("auto", dict or list): specify the data types for each curve in the
~ASCII data section. If "auto", each curve will be converted to floats if
possible and remain as str if not. If a dict, you need only specify the
curve mnemonics you want to convert, using them as keys. If a list, please specify
data types for each curve in order. Note that the conversion currently
only occurs via numpy.ndarray.astype() and therefore only a few simple
casts will work e.g. `int`, `float`, `str`.

See :func:`lasio.reader.open_with_codecs` for additional keyword
arguments which help to manage issues related to character encodings.
Expand Down Expand Up @@ -261,16 +270,28 @@ def read(
ignore_comments=ignore_data_comments,
)

# How many curves should the reader attempt to find?
reader_n_columns = n_columns
if reader_n_columns == -1:
reader_n_columns = len(self.curves)

file_obj.seek(k)

# Convert dtypes passed as dict into list for all columns
# defaulting to float for any not specified.
if isinstance(dtypes, dict):
dtypes = [dtypes.get(c.mnemonic, float) for c in self.curves]

# Notes see 2d9e43c3 and e960998f for 'try' background
try:
arr = reader.read_data_section_iterative(
curves_data_gen = reader.read_data_section_iterative(
file_obj,
(first_line, last_line),
regexp_subs,
value_null_subs,
ignore_comments=ignore_data_comments,
n_columns=n_columns,
n_columns=reader_n_columns,
dtypes=dtypes,
)
except KeyboardInterrupt:
raise
Expand All @@ -279,68 +300,31 @@ def read(
traceback.format_exc()[:-1]
+ " in data section beginning line {}".format(i + 1)
)
logger.debug(
"Read ndarray {arrshape} from data section".format(
arrshape=arr.shape
)
)

# This is so we can check data size and use self.set_data(data, truncate=False)
# in cases of data.size is zero.
data = arr

if data.size > 0:
# TODO: check whether this treatment of NULLs is correct
logger.debug("~A data {}".format(arr))
if version_NULL:
arr[arr == provisional_null] = np.nan
logger.debug("~A after NULL replacement data {}".format(arr))

# Provisionally, assume that the number of columns represented
# by the data section's array is equal to the number of columns
# defined in the Curves/Definition section.

n_columns_in_arr = len(self.curves)
# Assign data to curves.
n = 0
for curve_arr in curves_data_gen:

# If we are told the file is unwrapped, then we assume that each
# column detected is a column, and we ignore the Curves/Definition
# section's number of columns instead.

if provisional_wrapped == "NO":
n_columns_in_arr = n_columns

# ---------------------------------------------------------------------
# TODO:
# This enables tests/test_read.py::test_barebones_missing_all_sections
# to pass, but may not be the complete or final solution.
# ---------------------------------------------------------------------
if len(self.curves) == 0 and n_columns > 0:
n_columns_in_arr = n_columns
# Do not replace nulls in the index curve.
if version_NULL and curve_arr.dtype == float and n != 0:
logger.debug(
"Replacing {} with nan in {}-th curve".format(
provisional_null, n
)
)
curve_arr[curve_arr == provisional_null] = np.nan

logger.debug(
"Data array (size {}) assumed to have {} columns "
"({} curves defined)".format(
arr.shape, n_columns_in_arr, len(self.curves)
)
"Assigning data {} to curve #{}".format(curve_arr, n)
)
if n < len(self.curves):
self.curves[n].data = curve_arr
else:
logger.debug("Creating new curve")
curve = CurveItem(mnemonic="", data=curve_arr)
self.curves.append(curve)
n += 1

# We attempt to reshape the 1D array read in from
# the data section so that it can be assigned to curves.
try:
data = np.reshape(arr, (-1, n_columns_in_arr))
except ValueError as exception:
error_message = "Cannot reshape ~A data size {0} into {1} columns".format(
arr.shape, n_columns_in_arr
)
if sys.version_info.major < 3:
exception.message = error_message
raise exception
else:
raise ValueError(error_message).with_traceback(
exception.__traceback__
)

self.set_data(data, truncate=False)
finally:
if hasattr(file_obj, "close"):
file_obj.close()
Expand Down
93 changes: 81 additions & 12 deletions lasio/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,15 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, ignore_comments="#"):
try:
assert len(set(item_counts)) == 1
except AssertionError:
logger.debug("Inconsistent number of columns {}".format(item_counts))
return -1
else:
logger.debug("Consistently found {} columns".format(item_counts[0]))
return item_counts[0]


def read_data_section_iterative(
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns
file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns, dtypes
):
"""Read data section into memory.

Expand All @@ -381,14 +383,20 @@ def read_data_section_iterative(
value_null_subs (list): list of numerical values to be replaced by
numpy.nan values.
ignore_comments (str): lines beginning with this character will be ignored
n_columns (int, None): expected number of columns, or None/-1 if unknown
n_columns (int): expected number of columns
dtypes (list, "auto", False): list of expected data types for each column,
(each data type can be specified as e.g. `int`,
`float`, `str`, `datetime`). If you specify 'auto', then this function
will attempt to convert each column to a float and if that fails,
the column will be returned as a string. If you specify False, no
conversion of data types will be attempted at all.

Returns:
A 1-D numpy ndarray.
Returns: generator which yields the data as a 1D ndarray for each column at a time.

"""
if n_columns == -1:
n_columns = None
logger.debug(
"Attempting to read {} columns between lines {}".format(n_columns, line_nos)
)

title = file_obj.readline()

Expand Down Expand Up @@ -421,12 +429,17 @@ def items(f, start_line_no, end_line_no):
)
for value in value_null_subs:
array[array == value] = np.nan
logger.debug("Successfully read {} items in data section".format(len(array)))

if not n_columns is None:
logger.debug(
"Attempting to re-shape into 2D array with {} columns".format(n_columns)
)
logger.debug("Read {} items in data section".format(len(array)))

# Cater for situations where the data section is empty.
if len(array) == 0:
logger.warning("Data section is empty therefore setting n_columns to zero")
n_columns = 0

# Re-shape the 1D array to a 2D array.
if n_columns > 0:
logger.debug("Attempt re-shape to {} columns".format(n_columns))
try:
array = np.reshape(array, (-1, n_columns))
except ValueError as exception:
Expand All @@ -439,7 +452,63 @@ def items(f, start_line_no, end_line_no):
else:
raise ValueError(error_message).with_traceback(exception.__traceback__)

return array
# Identify how many columns have actually been found.
if len(array.shape) < 2:
arr_n_cols = 0
else:
arr_n_cols = array.shape[1]

# Identify what the appropriate data types should be for each column based on the first
# row of the data.
if dtypes == "auto":
if len(array) > 0:
dtypes = identify_dtypes_from_data(array[0, :])
else:
dtypes = []
elif dtypes is False:
dtypes = [str for n in range(arr_n_cols)]

# Iterate over each column, convert to the appropriate dtype (if possible)
# and then yield the data column.
for col_idx in range(arr_n_cols):
curve_arr = array[:, col_idx]
curve_dtype = dtypes[col_idx]
try:
curve_arr = curve_arr.astype(curve_dtype, copy=False)
except ValueError:
logger.warning(
"Could not convert curve #{} to {}".format(col_idx, curve_dtype)
)
else:
logger.debug(
"Converted curve {} to {} ({})".format(col_idx, curve_dtype, curve_arr)
)
yield curve_arr


def identify_dtypes_from_data(row):
    """Identify which columns should be 'str' and which 'float'.

    Each value in *row* is test-converted with ``float()``; columns whose
    value converts cleanly are typed ``float``, all others fall back to
    ``str``.

    Args:
        row (1D ndarray): first row of the data section.

    Returns:
        list of dtypes e.g. ``[float, float, str, ...]``, one per column.

    """
    logger.debug("Creating auto dtype spec from first line of data array")
    dtypes_list = []
    for i, value in enumerate(row):
        try:
            float(value)
        # Only catch conversion failures (e.g. non-numeric str -> ValueError,
        # None -> TypeError); a bare except would also swallow
        # KeyboardInterrupt/SystemExit.
        except (TypeError, ValueError):
            dtypes_list.append(str)
        else:
            dtypes_list.append(float)
        logger.debug(
            "Column {}: value {} -> dtype {}".format(i, value, dtypes_list[-1])
        )
    return dtypes_list


def get_substitutions(read_policy, null_policy):
Expand Down
27 changes: 27 additions & 0 deletions tests/examples/sample_str_in_data.las
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
~VERSION INFORMATION
VERS. 1.2: CWLS LOG ASCII STANDARD -VERSION 1.2
WRAP. NO: ONE LINE PER DEPTH STEP
~WELL INFORMATION BLOCK
STRT.M 1670.000000:
STOP.M 1669.750000:
STEP.M -0.1250:
NULL. -999.2500:
COMP. COMPANY: # ANY OIL COMPANY LTD.
WELL. WELL: ANY ET AL OIL WELL #12
FLD . FIELD: EDAM
LOC . LOCATION: A9-16-49-20W3M
PROV. PROVINCE: SASKATCHEWAN
SRVC. SERVICE COMPANY: ANY LOGGING COMPANY LTD.
DATE. LOG DATE: 25-DEC-1988
UWI . UNIQUE WELL ID: 100091604920W300
~CURVE INFORMATION
DEPT.M : 1 DEPTH
DT_STR .US/M : 2 SONIC TRANSIT TIME
RHOB_INT.K/M3 : 3 BULK DENSITY
NPHI_FLOAT.V/V : 4 NEUTRON POROSITY
~PARAMETER INFORMATION
~Other
~A DEPTH DT RHOB NPHI
1670.000 123.450 2550.000 0.450
1669.875 123.450 2550.000 0.450
1669.750 123.450 2550.000 0.450
33 changes: 33 additions & 0 deletions tests/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from numbers import Number

import lasio
import lasio.examples

test_dir = os.path.dirname(__file__)

Expand Down Expand Up @@ -446,3 +447,35 @@ def test_read_v2_sample_empty_other_section():
las = lasio.read(stegfn("2.0", "sample_2.0_empty_other_section.las"))
assert las.other == ""
assert las.data[0][0] == 1670.0


def test_sample_dtypes_specified():
    """Dtypes passed as a list are applied to the curves in order."""
    las = lasio.examples.open(
        "sample_str_in_data.las", read_policy=[], dtypes=[float, str, int, float]
    )
    dt_value = las.curves[1].data[0]
    rhob_value = las.curves[2].data[0]
    nphi_value = las.curves[3].data[0]
    # DT_STR was requested as str.
    assert isinstance(dt_value, str)
    # RHOB_INT: numpy integer scalars are not instances of the builtin int,
    # so instead confirm the value is at least no longer a float.
    assert not isinstance(rhob_value, float)
    # NPHI_FLOAT was requested as float.
    assert isinstance(nphi_value, float)


def test_sample_dtypes_specified_as_dict():
    """A dtypes dict keyed by mnemonic converts only the named curves."""
    dtype_spec = {"NPHI_FLOAT": str}
    las = lasio.examples.open(
        "sample_str_in_data.las", read_policy=[], dtypes=dtype_spec
    )
    # RHOB_INT is absent from the dict, so it defaults to float.
    assert isinstance(las.curves[2].data[0], float)
    # NPHI_FLOAT was explicitly requested as str.
    assert isinstance(las.curves[3].data[0], str)


def test_sample_dtypes_specified_as_false():
    """dtypes=False disables all conversion: every curve stays as str."""
    las = lasio.examples.open("sample_str_in_data.las", read_policy=[], dtypes=False)
    for curve in las.curves[:4]:
        assert isinstance(curve.data[0], str)