Skip to content

Commit

Permalink
Add serialization and deserialization of numerals larger than `Number…
Browse files Browse the repository at this point in the history
….MAX_SAFE_INTEGER` (#544) (#554)

Signed-off-by: Miki <miki@amazon.com>
  • Loading branch information
AMoo-Miki authored Jul 12, 2023
1 parent 0c6a70f commit e519441
Show file tree
Hide file tree
Showing 5 changed files with 363 additions and 2 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
## [Unreleased]

### Added

- Add serialization and deserialization of numerals larger than `Number.MAX_SAFE_INTEGER` ([#544](https://github.com/opensearch-project/opensearch-js/pull/544))

### Dependencies
### Changed

Expand Down
243 changes: 241 additions & 2 deletions lib/Serializer.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,60 @@ const sjson = require('secure-json-parse');
const { SerializationError, DeserializationError } = require('./errors');
const kJsonOptions = Symbol('secure json parse options');

/* In JavaScript, a `Number` is a 64-bit floating-point value which can store 16 digits. However, the
* serializer and deserializer will need to cater to numeric values generated by other languages which
* can have up to 19 digits. Native JSON parser and stringifier, incapable of handling the extra
* digits, corrupt the values, making them unusable.
*
* To work around this limitation, the deserializer converts long sequences of digits into strings and
* marks them before applying the parser. During the parsing, string values that begin with the mark
* are converted to `BigInt` values.
* Similarly, during stringification, the serializer converts `BigInt` values to marked strings and
* when done, it replaces them with plain numerals.
*
* `Number.MAX_SAFE_INTEGER`, 9,007,199,254,740,991, is the largest number that the native methods can
* parse and stringify, and any numeral greater than that would need to be translated using the
* workaround; all 17-digits or longer and only tail-end of the 16-digits need translation. It would
* be unfair to all the 16-digit numbers if the translation applied to `\d{16,}` only to cover the
* less than 10%. Hence, a RegExp is created to only match numerals too long to be a number.
*
* To make the explanation simpler, let's assume that MAX_SAFE_INTEGER is 8921 which has 4 digits.
* Starting from the right, we take each digit onwards, `[<start>-9]`:
* 1) 8922 - 8929: 892[2-9]\d{0}
* 2) 8930 - 8999: 89[3-9]\d{1}
* 9) 9 + 1 = 10 which results in a rollover; no need to do anything.
* 8) 9000 - 9999: [9-9]\d{3}
* Finally we add anything 5 digits or longer: `\d{5,}`
*
* PS, a better solution would use AST but considering its performance penalty, RegExp is the next
* best solution.
*/
// `BigInt` is missing on older runtimes; everything BigInt-related is gated on this flag.
const isBigIntSupported = typeof BigInt !== 'undefined';
const maxIntAsString = String(Number.MAX_SAFE_INTEGER);
const maxIntLength = maxIntAsString.length;
// Sub-patterns for each digit; the first one covers every numeral longer than MAX_SAFE_INTEGER.
const bigIntMatcherTokens = [`\\d{${maxIntAsString.length + 1},}`];
maxIntAsString.split('').forEach((digit, position) => {
  // A `9` cannot be raised any further, so it contributes no sub-pattern.
  if (digit === '9') return;

  const unchangedPrefix = maxIntAsString.substring(0, position);
  const nextDigit = parseInt(digit, 10) + 1;
  const trailingDigits = maxIntLength - position - 1;
  // Same prefix, a strictly larger digit at this position, then any digits for the rest.
  bigIntMatcherTokens.push(`${unchangedPrefix}[${nextDigit}-9]\\d{${trailingDigits}}`);
});

/* The matcher that looks for `": <numerals>, ...}` and `[..., <numeral>, ...]`
*
* The pattern has three capture groups followed by a lookahead:
*   $1) the lead-in: a `[`, a `,`, or a `":` whose quote is not immediately preceded by a `\`,
*       plus any whitespace after it;
*   $2) an optional `-` followed by any of the numeric sub-patterns from `bigIntMatcherTokens`;
*   $3) any whitespace trailing the numeral.
* A comma, end of an array, end of an object, or the end of the input are the only acceptable
* elements after it; they are checked with a lookahead and are not consumed.
*
* The `g` flag makes `String.prototype.replace` rewrite every occurrence.
*/
const bigIntMatcher = new RegExp(
`((?:\\[|,|(?<!\\\\)"\\s*:)\\s*)(-?(?:${bigIntMatcherTokens.join('|')}))(\\s*)(?=,|}|]|$)`,
'g'
);

class Serializer {
constructor(opts = {}) {
const disable = opts.disablePrototypePoisoningProtection;
Expand All @@ -44,11 +98,173 @@ class Serializer {
};
}

/* The characters with a highly unlikely chance of occurrence in strings, alone or in combination.
*
* ToDo: When support for ancient versions of Node.js are dropped, replace with
* _bigIntMarkChars = ['෴', '߷', '֍'];
*/
get _bigIntMarkChars() {
return ['෴', '߷', '֍'];
}

/* Generates an array of all combinations of `_bigIntMarkChars` with the requested length. */
_bigIntMarkerCombinations(length = 3) {
const results = [];
const arr = this._bigIntMarkChars;
const arrLength = arr.length;
const temp = Array(length);

(function fill(pos, start) {
if (pos === length) return results.push(temp.join(''));

for (let i = start; i < arrLength; i++) {
temp[pos] = arr[i];
fill(pos + 1, i);
}
})(0, 0);

return results;
}

/* Experiments with different combinations of various lengths, until one is found to not be in
* the input string.
*/
_getSuitableBigIntMarker(json) {
let bigIntMarker;
let length = 0;
do {
length++;
this._bigIntMarkerCombinations(length).some((marker) => {
if (json.indexOf(marker) === -1) {
bigIntMarker = marker;
return true;
}
});
} while (!bigIntMarker);

return {
bigIntMarker,
length,
};
}

_parseWithBigInt(json) {
const { bigIntMarker, length } = this._getSuitableBigIntMarker(json);

let hadException;
let markedJSON = json.replace(bigIntMatcher, `$1"${bigIntMarker}$2"$3`);

/* RegExp cannot replace AST and the process of marking adds quotes. So, any false-positive hit
* will make the JSON string unparseable.
*
* To find those instances, we try to parse and watch for the location of any errors. If an error
* is caused by the marking, we remove that single marking and try again.
*/
do {
try {
hadException = false;
JSON.parse(markedJSON);
} catch (e) {
hadException = true;
/* There are two types of exception objects that can be raised:
* 1) a proper object with lineNumber and columnNumber which we can use
* 2) a textual message with the position that we need to parse
*/
let { lineNumber, columnNumber } = e;
if (!lineNumber || !columnNumber) {
const match =
// ToDo: When support for ancient versions of Node.js are dropped, replace with
// e?.message?.match?.()
e &&
e.message &&
typeof e.message.match === 'function' &&
e.message.match(/^Unexpected token.*at position (\d+)$/);
if (match) {
lineNumber = 1;
// The position is zero-indexed; adding 1 to normalize it for the -2 that comes later
columnNumber = parseInt(match[1], 10) + 1;
}
}

if (lineNumber < 1 || columnNumber < 2) {
// The problem is not with this replacement; just return a failure.
return;
}

/* We need to skip e.lineNumber - 1 number of `\n` occurrences.
* Then, we need to go to e.columnNumber - 2 to look for `"<mark>\d+"`; we need to `-1` to
* account for the quote but an additional `-1` is needed because columnNumber starts from 1.
*/
const re = new RegExp(
`^((?:.*\\n){${lineNumber - 1}}[^\\n]{${columnNumber - 2}})"${bigIntMarker}(-?\\d+)"`
);
if (!re.test(markedJSON)) {
// The exception is not caused by adding the marker
return;
}

// We have found a bad replacement; let's remove it.
markedJSON = markedJSON.replace(re, '$1$2');
}
} while (hadException);

const bigIntMarkFinder = new RegExp(`^${bigIntMarker}-?\\d+$`);

// Exceptions will trickle up to the caller
return sjson.parse(
markedJSON,
(key, val) =>
/* Convert marked values to BigInt values.
* The `startsWith` is purely for performance, to avoid running `test` if not needed.
*/
typeof val === 'string' && val.startsWith(bigIntMarker) && bigIntMarkFinder.test(val)
? BigInt(val.substring(length)) // eslint-disable-line no-undef
: val,
this[kJsonOptions]
);
}

_stringifyWithBigInt(object, candidate) {
const { bigIntMarker } = this._getSuitableBigIntMarker(candidate);

/* The matcher that looks for "<marker><numerals>"
* Because we have made sure that `bigIntMarker` was never present in the original object, we can
* carelessly assume every "<marker><numerals>" is due to our marking.
*/
const markedBigIntMatcher = new RegExp(`"${bigIntMarker}(-?\\d+)"`, 'g');

return (
JSON.stringify(
object,
/* Convert BigInt values to a string and mark them.
* Can't be bothered with Number values beyond safe values because they are already corrupted.
*/
(key, val) => (typeof val === 'bigint' ? `${bigIntMarker}${val.toString()}` : val)
)
// Replace marked substrings with just the numerals
.replace(markedBigIntMatcher, '$1')
);
}

serialize(object) {
debug('Serializing', object);
let json;
// Set to `false` by the replacer below as soon as it meets a BigInt value.
let numeralsAreNumbers = true;
/* Replacer for `JSON.stringify` that notes the presence of BigInt values and swaps each one for
 * a Number so that stringification does not throw.
 */
const checkForBigInts = (key, val) => {
  if (typeof val !== 'bigint') return val;

  numeralsAreNumbers = false;
  // Number() is much faster than parseInt() on BigInt values
  return Number(val);
};
try {
json = JSON.stringify(object);
json = JSON.stringify(object, isBigIntSupported ? checkForBigInts : null);

if (isBigIntSupported && !numeralsAreNumbers) {
const temp = this._stringifyWithBigInt(object, json);
if (temp) json = temp;
}
} catch (err) {
throw new SerializationError(err.message, object);
}
Expand All @@ -58,8 +274,31 @@ class Serializer {
deserialize(json) {
debug('Deserializing', json);
let object;
// Set to `false` by the reviver below once a number outside the safe integer range is seen.
let numeralsAreNumbers = true;
/* Reviver for the first parsing pass. It never alters values; it only records whether any
 * parsed number lies outside [Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER], which means
 * the plain parse may have lost precision.
 */
const checkForLargeNumerals = (key, val) => {
  if (
    // Skip the comparisons once the flag has been cleared.
    numeralsAreNumbers &&
    typeof val === 'number' &&
    // The lower bound is MIN_SAFE_INTEGER; comparing to MAX_SAFE_INTEGER would flag every number.
    (val < Number.MIN_SAFE_INTEGER || val > Number.MAX_SAFE_INTEGER)
  ) {
    numeralsAreNumbers = false;
  }

  return val;
};
try {
object = sjson.parse(json, this[kJsonOptions]);
object = sjson.parse(
json,
isBigIntSupported ? checkForLargeNumerals : null,
this[kJsonOptions]
);

if (isBigIntSupported && !numeralsAreNumbers) {
const temp = this._parseWithBigInt(json);
if (temp) {
object = temp;
}
}
} catch (err) {
throw new DeserializationError(err.message, json);
}
Expand Down
3 changes: 3 additions & 0 deletions test/fixtures/longnumerals-dataset.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"number":18014398509481982,"description":"-18014398509481982 , -1 , 1 , 18014398509481982"}
{"number":-18014398509481982,"description":"෴18014398509481982"}
{"number":9007199254740891,"description":"Safer than [18014398509481982]"}
90 changes: 90 additions & 0 deletions test/integration/serializer/longnumerals.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
*/

/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

'use strict';

const { createReadStream } = require('fs');
const { join } = require('path');
const split = require('split2');
const { test, beforeEach, afterEach } = require('tap');

const { Client } = require('../../../');

const INDEX = `test-serializer-${process.pid}`;
const client = new Client({
node: process.env.TEST_OPENSEARCH_SERVER || 'http://localhost:9200',
});

// Before every test: create the index and bulk-load the long-numerals fixture into it.
beforeEach(async () => {
  await client.indices.create({ index: INDEX });

  const fixturePath = join(__dirname, '..', '..', 'fixtures', 'longnumerals-dataset.ndjson');
  // Each line of the NDJSON fixture is one document.
  const datasource = createReadStream(fixturePath).pipe(split());

  const outcome = await client.helpers.bulk({
    datasource,
    refreshOnCompletion: true,
    // Every document is indexed into the test index.
    onDocument: () => ({ index: { _index: INDEX } }),
  });

  const failedCount = outcome.failed;
  if (failedCount > 0) {
    throw new Error('Failed bulk indexing docs');
  }
});

// After every test: delete the test index. The `ignore: 404` option is passed so that a missing
// index is not treated as a failure.
afterEach(async () => {
await client.indices.delete({ index: INDEX }, { ignore: 404 });
});

/* Searches the fixture documents with a BigInt bound in the request body and checks that numerals
 * beyond the safe integer range come back as BigInt while the safe one stays a Number.
 */
test('long numerals', async (t) => {
  const query = {
    range: {
      number: {
        lt: 999999999999999999n,
      },
    },
  };
  const hits = await client.helpers.search({ index: INDEX, body: { query } });
  t.equal(hits.length, 3);

  // Key the returned numbers by their description so the comparison ignores hit order.
  const numberByDescription = {};
  for (const { description, number } of hits) {
    numberByDescription[description] = number;
  }

  const expected = {
    '-18014398509481982 , -1 , 1 , 18014398509481982': 18014398509481982n,
    '෴18014398509481982': -18014398509481982n,
    'Safer than [18014398509481982]': 9007199254740891,
  };
  t.same(numberByDescription, expected);
});
Loading

0 comments on commit e519441

Please sign in to comment.