Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

20490: Unifies .nas, and .nan under null keyword, MAJOR #155

Merged
merged 17 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ <h2>Language syntax</h2>
(call (retrieve_from_contained_entity "MyLibrary" "MyFunction") (assoc parameter_a 1 parameter_b 2))
</span>
<p>
Numbers are represented via numbers, as well as ".", "-", and "e" for base-ten exponents. Further, infinity and negative infinity are represented as ".infinity" and "-.infinity" respectively, and not-a-number is represented as ".nan". The special null value with string type, not-a-string, is represented as ".nas".
Numbers are represented via numbers, as well as ".", "-", and "e" for base-ten exponents. Further, infinity and negative infinity are represented as ".infinity" and "-.infinity" respectively. Not-a-number and non-string results are represented via the opcode (null).
<p>
All regular expressions are ECMA-standard regular expressions. See <a href="https://en.cppreference.com/w/cpp/regex/ecmascript">https://en.cppreference.com/w/cpp/regex/ecmascript</a> or <a href="https://262.ecma-international.org/5.1/#sec-15.10">https://262.ecma-international.org/5.1/#sec-15.10</a> for further details on syntax.

Expand Down
2 changes: 1 addition & 1 deletion docs/language.js
Original file line number Diff line number Diff line change
Expand Up @@ -1549,7 +1549,7 @@ var data = [
"parameter" : "query_mode string label_name [string weight_label_name] [bool numeric]",
"output" : "query",
"new value" : "new",
"description" : "When used as a query argument, finds the statistical mode of label_name for numerical data. If weight_label_name is specified, it will find the weighted mode. If numeric is true, its default, then it will treat all values as numeric, otherwise it will treat them all as strings. If numeric and no numeric mode exists, it will return .nan, but if string and no string mode exists, it will return null.",
"description" : "When used as a query argument, finds the statistical mode of label_name for numerical data. If weight_label_name is specified, it will find the weighted mode. If numeric is true, its default, then it will treat all values as numeric, otherwise it will treat them all as strings. If numeric and no numeric mode exists, it will return (null), but if string and no string mode exists, it will return null.",
howsohazard marked this conversation as resolved.
Show resolved Hide resolved
"example" : "(compute_on_contained_entities \"TestEntity\" (list\n (query_mode \"TargetLabel\")\n))"
},

Expand Down
12 changes: 6 additions & 6 deletions src/Amalgam/GeneralizedDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,8 +373,8 @@ class GeneralizedDistanceEvaluator
__forceinline double ComputeDistanceTermNominal(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy)
{
bool a_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(a_type, a);
bool b_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(b_type, b);
bool a_is_null = EvaluableNodeImmediateValue::IsNull(a_type, a);
bool b_is_null = EvaluableNodeImmediateValue::IsNull(b_type, b);
if(a_is_null && b_is_null)
return ComputeDistanceTermUnknownToUnknown(index, high_accuracy);

Expand Down Expand Up @@ -754,8 +754,8 @@ class GeneralizedDistanceEvaluator
__forceinline double LookupNullDistanceTerm(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy)
{
bool a_unknown = EvaluableNodeImmediateValue::IsNullEquivalent(a_type, a);
bool b_unknown = EvaluableNodeImmediateValue::IsNullEquivalent(b_type, b);
bool a_unknown = EvaluableNodeImmediateValue::IsNull(a_type, a);
bool b_unknown = EvaluableNodeImmediateValue::IsNull(b_type, b);
if(a_unknown && b_unknown)
return ComputeDistanceTermUnknownToUnknown(index, high_accuracy);
if(a_unknown || b_unknown)
Expand Down Expand Up @@ -1245,9 +1245,9 @@ class RepeatedGeneralizedDistanceEvaluator
return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
}

if(EvaluableNodeImmediateValue::IsNullEquivalent(other_type, other_value))
if(EvaluableNodeImmediateValue::IsNull(other_type, other_value))
{
if(feature_data.targetValue.IsNullEquivalent())
if(feature_data.targetValue.IsNull())
return distEvaluator->ComputeDistanceTermUnknownToUnknown(index, high_accuracy);
else
return distEvaluator->ComputeDistanceTermKnownToUnknown(index, high_accuracy);
Expand Down
3 changes: 1 addition & 2 deletions src/Amalgam/Opcodes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ void StringInternPool::InitializeStaticStrings()
stringToID.reserve(numStaticStrings);
idToStringAndRefCount.resize(numStaticStrings);

EmplaceStaticString(ENBISI_NOT_A_STRING, ".nas");
EmplaceStaticString(ENBISI_NOT_A_STRING, "(null)");
EmplaceStaticString(ENBISI_EMPTY_STRING, "");


Expand Down Expand Up @@ -293,7 +293,6 @@ void StringInternPool::InitializeStaticStrings()
//end opcodes

//built-in common values
EmplaceStaticString(ENBISI_nan, ".nan");
EmplaceStaticString(ENBISI_infinity, ".infinity");
EmplaceStaticString(ENBISI_neg_infinity, "-.infinity");
EmplaceStaticString(ENBISI_zero, "0");
Expand Down
4 changes: 1 addition & 3 deletions src/Amalgam/Opcodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -509,9 +509,7 @@ enum EvaluableNodeBuiltInStringId
//leave space for ENT_ opcodes, start at the end

//built-in common values
ENBISI_nas = NUM_VALID_ENT_OPCODES + NUM_ENBISI_SPECIAL_STRING_IDS,
ENBISI_nan,
ENBISI_infinity,
ENBISI_infinity = NUM_VALID_ENT_OPCODES + NUM_ENBISI_SPECIAL_STRING_IDS,
ENBISI_neg_infinity,
ENBISI_zero,
ENBISI_one,
Expand Down
33 changes: 24 additions & 9 deletions src/Amalgam/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,18 +510,10 @@ EvaluableNode *Parser::GetNextToken(EvaluableNode *parent_node, EvaluableNode *n

//check for special values
double value = 0.0;
if(s == ".nas")
{
new_token->SetType(ENT_STRING, evaluableNodeManager, false);
new_token->SetStringID(StringInternPool::NOT_A_STRING_ID);
return new_token;
}
if(s == ".infinity")
value = std::numeric_limits<double>::infinity();
else if(s == "-.infinity")
value = -std::numeric_limits<double>::infinity();
else if(s == ".nan")
value = std::numeric_limits<double>::quiet_NaN();
else
{
auto [converted_value, success] = Platform_StringToNumber(s);
Expand Down Expand Up @@ -598,6 +590,29 @@ EvaluableNode *Parser::ParseNextBlock()
}
else if(cur_node->IsAssociativeArray())
{
//if it's not an immediate value, then need to retrieve closing parenthesis
if(!IsEvaluableNodeTypeImmediate(n->GetType()))
{
SkipWhitespaceAndAccumulateAttributes(n);
if(pos <= code->size())
{
auto cur_char = (*code)[pos];
if(cur_char == ')')
{
pos++;
numOpenParenthesis--;
}
else
{
std::cerr << "Warning: " << "Missing ) at line " << lineNumber + 1 << " of " << originalSource << std::endl;
}
}
else //no more code
{
std::cerr << "Warning: " << "Mismatched ) at line " << lineNumber + 1 << " of " << originalSource << std::endl;
}
}

//n is the id, so need to get the next token
StringInternPool::StringID index_sid = EvaluableNode::ToStringIDTakingReferenceAndClearing(n);

Expand Down Expand Up @@ -863,7 +878,7 @@ void Parser::Unparse(UnparseData &upd, EvaluableNode *tree, EvaluableNode *paren
auto sid = tree->GetStringIDReference();
if(sid == string_intern_pool.NOT_A_STRING_ID)
{
upd.result.append(".nas");
upd.result.append("(null)");
}
else //legitimate string
{
Expand Down
116 changes: 23 additions & 93 deletions src/Amalgam/SBFDSColumnData.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class SBFDSColumnData
//indicates the column does not use indices
static constexpr size_t NO_INDEX = std::numeric_limits<size_t>::max();
//null value is always the 0th index
static constexpr size_t NAN_INDEX = 0;
static constexpr size_t NULL_INDEX = 0;

//if empty, initialize to invalid index
ValueEntry()
Expand Down Expand Up @@ -74,10 +74,7 @@ class SBFDSColumnData
else if(value_type == ENIVT_NUMBER)
{
numberIndices.insert(index);
if(FastIsNaN(value.number))
nanIndices.insert(index);
else
entities_with_number_values.emplace_back(value.number, index);
entities_with_number_values.emplace_back(value.number, index);
}
else if(value_type == ENIVT_STRING_ID)
{
Expand Down Expand Up @@ -204,7 +201,7 @@ class SBFDSColumnData
}

if(internedNumberValues.valueInterningEnabled)
return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
return EvaluableNodeImmediateValue(ValueEntry::NULL_INDEX);
else
return EvaluableNodeImmediateValue();
}
Expand All @@ -221,28 +218,9 @@ class SBFDSColumnData
{
double old_number_value = GetResolvedValue(old_value_type, old_value).number;
double new_number_value = GetResolvedValue(new_value_type, new_value).number;
if(EqualIncludingNaN(old_number_value, new_number_value))
if(old_number_value == new_number_value)
howsoRes marked this conversation as resolved.
Show resolved Hide resolved
return old_value;

//if made it here, then at least one of the values is not a NaN
//if one value is a NaN, just insert or delete as regular since there's little to be saved
if(FastIsNaN(old_number_value))
{
nanIndices.erase(index);
return InsertIndexValue(new_value_type, new_value, index);
}

if(FastIsNaN(new_number_value))
{
DeleteIndexValue(old_value_type, old_value, index);
nanIndices.insert(index);

if(internedNumberValues.valueInterningEnabled)
return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
else
return EvaluableNodeImmediateValue(std::numeric_limits<double>::quiet_NaN());
}

//if the value already exists, then put the index in the list
//but return the lower bound if not found so don't have to search a second time
//need to search the old value before inserting, as FindExactIndexForValue is fragile a placeholder empty entry
Expand Down Expand Up @@ -453,25 +431,24 @@ class SBFDSColumnData

case ENIVT_NUMBER:
case ENIVT_NUMBER_INDIRECTION_INDEX:
{
numberIndices.erase(index);

//remove, and if not a nan, then need to also remove the number
if(!nanIndices.EraseAndRetrieve(index))
{
auto resolved_value = GetResolvedValue(value_type, value);
auto resolved_value = GetResolvedValue(value_type, value);

//look up value
auto [value_index, exact_index_found] = FindExactIndexForValue(resolved_value.number);
if(!exact_index_found)
return;
//look up value
auto [value_index, exact_index_found] = FindExactIndexForValue(resolved_value.number);
if(!exact_index_found)
return;

//if the bucket has only one entry, we must delete the entire bucket
if(sortedNumberValueEntries[value_index]->indicesWithValue.size() == 1)
DeleteNumberValueEntry(value_index);
else //else we can just remove the id from the bucket
sortedNumberValueEntries[value_index]->indicesWithValue.erase(index);

//if the bucket has only one entry, we must delete the entire bucket
if(sortedNumberValueEntries[value_index]->indicesWithValue.size() == 1)
DeleteNumberValueEntry(value_index);
else //else we can just remove the id from the bucket
sortedNumberValueEntries[value_index]->indicesWithValue.erase(index);
}
break;
}

case ENIVT_STRING_ID:
{
Expand Down Expand Up @@ -541,7 +518,7 @@ class SBFDSColumnData
invalidIndices.insert(index);

if(internedNumberValues.valueInterningEnabled)
return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
return EvaluableNodeImmediateValue(ValueEntry::NULL_INDEX);
else
return value;
}
Expand All @@ -551,7 +528,7 @@ class SBFDSColumnData
nullIndices.insert(index);

if(internedNumberValues.valueInterningEnabled)
return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
return EvaluableNodeImmediateValue(ValueEntry::NULL_INDEX);
else
return value;
}
Expand All @@ -561,15 +538,6 @@ class SBFDSColumnData
numberIndices.insert(index);

double number_value = GetResolvedValue(value_type, value).number;
if(FastIsNaN(number_value))
{
nanIndices.insert(index);

if(internedNumberValues.valueInterningEnabled)
return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
else
return value;
}

//if the value already exists, then put the index in the list
//but return the lower bound if not found so don't have to search a second time
Expand Down Expand Up @@ -784,8 +752,7 @@ class SBFDSColumnData
}

//given a feature_id and a range [low, high], inserts all the elements with values of feature feature_id within specified range into out; does not clear out
//Note about Null/NaNs:
//if the feature value is Nan/Null, it will NOT be present in the search results, ie "x" != 3 will NOT include elements with x is nan/Null, even though nan/null != 3
//if the feature value is null, it will NOT be present in the search results, ie "x" != 3 will NOT include elements with x is null, even though null != 3
void FindAllIndicesWithinRange(EvaluableNodeImmediateValueType value_type,
EvaluableNodeImmediateValue &low, EvaluableNodeImmediateValue &high, BitArrayIntegerSet &out, bool between_values = true)
{
Expand All @@ -801,26 +768,9 @@ class SBFDSColumnData

if(FastIsNaN(low_number) || FastIsNaN(high_number))
{
//both are NaN
//both are NaN, return nothing
if(FastIsNaN(low_number) && FastIsNaN(high_number))
{
//if looking for NaN
if(between_values)
{
nanIndices.CopyTo(out);
}
else //looking for anything but NaN
{
numberIndices.CopyTo(out);
nanIndices.EraseTo(out);
}

return;
}

//if NaN specified and within range, then we want to include NaN indices
if(between_values)
nanIndices.CopyTo(out);

//modify range to include elements from or up to -/+inf
if(FastIsNaN(low_number)) //find all NaN values and all values up to max
Expand All @@ -839,10 +789,7 @@ class SBFDSColumnData
if(between_values)
return;
else //the value doesn't exist, include everything
{
//include nans
numberIndices.CopyTo(out);
}
}

//if within range, and range has no length, just return indices in that one bucket
Expand All @@ -853,9 +800,6 @@ class SBFDSColumnData
}
else //if not within, populate with all indices not equal to value
{
//include nans
nanIndices.CopyTo(out);

for(auto &value_entry : sortedNumberValueEntries)
{
if(value_entry->value.number == low_number)
Expand Down Expand Up @@ -942,13 +886,6 @@ class SBFDSColumnData
}
else if(value_type == ENIVT_NUMBER)
{
if(FastIsNaN(value.number))
{
//only want nans
nanIndices.UnionTo(out);
return;
}

auto [value_index, exact_index_found] = FindExactIndexForValue(value.number);
if(exact_index_found)
out.InsertInBatch(sortedNumberValueEntries[value_index]->indicesWithValue);
Expand Down Expand Up @@ -1135,10 +1072,6 @@ class SBFDSColumnData
//indices of entities with a null for this feature
EfficientIntegerSet nullIndices;

//indices of entities with a NaN for this feature
// the entities will also be included in numberIndices
EfficientIntegerSet nanIndices;

//indices that don't fall into the number/string/null types but are valid
EfficientIntegerSet codeIndices;

Expand Down Expand Up @@ -1215,11 +1148,11 @@ class SBFDSColumnData
else //not valid, clear queue
{
unusedValueIndices.clear();
//just use a new value, 0-based but leaving a spot open for NAN_INDEX
//just use a new value, 0-based but leaving a spot open for NULL_INDEX
value_entry->valueInternIndex = total_num_values;
}
}
else //just use new value of the latest size, 0-based but leaving a spot open for NAN_INDEX
else //just use new value of the latest size, 0-based but leaving a spot open for NULL_INDEX
{
value_entry->valueInternIndex = total_num_values;
}
Expand Down Expand Up @@ -1272,9 +1205,6 @@ class SBFDSColumnData
return ValueType();
}();

//static constexpr ValueType notAValue = std::conditional_t<std::is_same_v<ValueType, double>,
// std::numeric_limits<double>::quiet_NaN(), ValueType()>();

//if valueInterningEnabled is true, then contains each value for the given index
//if a given index isn't used, then it will contain the maximum value for the index
//the 0th index is reserved for values that are not of type ValueType,
Expand Down
Loading
Loading