diff --git a/CHANGELOG.md b/CHANGELOG.md index 3aa42792e..0f8027848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Unreleased] ### Added + +- Add serialization and deserialization of numerals larger than `Number.MAX_SAFE_INTEGER` ([#544](https://github.com/opensearch-project/opensearch-js/pull/544)) + ### Dependencies ### Changed diff --git a/lib/Serializer.js b/lib/Serializer.js index 06bec7199..37a9710c2 100644 --- a/lib/Serializer.js +++ b/lib/Serializer.js @@ -35,6 +35,60 @@ const sjson = require('secure-json-parse'); const { SerializationError, DeserializationError } = require('./errors'); const kJsonOptions = Symbol('secure json parse options'); +/* In JavaScript, a `Number` is a 64-bit floating-point value which can store 16 digits. However, the + * serializer and deserializer will need to cater to numeric values generated by other languages which + * can have up to 19 digits. Native JSON parser and stringifier, incapable of handling the extra + * digits, corrupt the values, making them unusable. + * + * To work around this limitation, the deserializer converts long sequences of digits into strings and + * marks them before applying the parser. During the parsing, string values that begin with the mark + * are converted to `BigInt` values. + * Similarly, during stringification, the serializer converts `BigInt` values to marked strings and + * when done, it replaces them with plain numerals. + * + * `Number.MAX_SAFE_INTEGER`, 9,007,199,254,740,991, is the largest number that the native methods can + * parse and stringify, and any numeral greater than that would need to be translated using the + * workaround; all 17-digits or longer and only tail-end of the 16-digits need translation. It would + * be unfair to all the 16-digit numbers if the translation applied to `\d{16,}` only to cover the + * less than 10%. Hence, a RegExp is created to only match numerals too long to be a number. + * + * To make the explanation simpler, let's assume that MAX_SAFE_INTEGER is 8921 which has 4 digits. + * Starting from the right, we take each digit onwards, `[-9]`: + * 1) 7922 - 7929: 792[2-9]\d{0} + * 2) 7930 - 7999: 79[3-9]\d{1} + * 9) 9 + 1 = 10 which results in a rollover; no need to do anything. + * 8) 9000 - 9999: [9-9]\d{3} + * Finally we add anything 5 digits or longer: `\d{5,} + * + * PS, a better solution would use AST but considering its performance penalty, RegExp is the next + * the best solution. + */ +const isBigIntSupported = typeof BigInt !== 'undefined'; +const maxIntAsString = String(Number.MAX_SAFE_INTEGER); +const maxIntLength = maxIntAsString.length; +// Sub-patterns for each digit +const bigIntMatcherTokens = [`\\d{${maxIntAsString.length + 1},}`]; +for (let i = 0; i < maxIntLength; i++) { + if (maxIntAsString[i] !== '9') { + bigIntMatcherTokens.push( + maxIntAsString.substring(0, i) + + `[${parseInt(maxIntAsString[i], 10) + 1}-9]` + + `\\d{${maxIntLength - i - 1}}` + ); + } +} + +/* The matcher that looks for `": , ...}` and `[..., , ...]` + * + * The pattern starts by looking for `":` not immediately preceded by a `\`. That should be + * followed by any of the numeric sub-patterns. A comma, end of an array, end of an object, or + * the end of the input are the only acceptable elements after it. + */ +const bigIntMatcher = new RegExp( + `((?:\\[|,|(? { + if (json.indexOf(marker) === -1) { + bigIntMarker = marker; + return true; + } + }); + } while (!bigIntMarker); + + return { + bigIntMarker, + length, + }; + } + + _parseWithBigInt(json) { + const { bigIntMarker, length } = this._getSuitableBigIntMarker(json); + + let hadException; + let markedJSON = json.replace(bigIntMatcher, `$1"${bigIntMarker}$2"$3`); + + /* RegExp cannot replace AST and the process of marking adds quotes. So, any false-positive hit + * will make the JSON string unparseable. + * + * To find those instances, we try to parse and watch for the location of any errors. If an error + * is caused by the marking, we remove that single marking and try again. + */ + do { + try { + hadException = false; + JSON.parse(markedJSON); + } catch (e) { + hadException = true; + /* There are two types of exception objects that can be raised: + * 1) a proper object with lineNumber and columnNumber which we can use + * 2) a textual message with the position that we need to parse + */ + let { lineNumber, columnNumber } = e; + if (!lineNumber || !columnNumber) { + const match = + // ToDo: When support for ancient versions of Node.js are dropped, replace with + // e?.message?.match?.() + e && + e.message && + typeof e.message.match === 'function' && + e.message.match(/^Unexpected token.*at position (\d+)$/); + if (match) { + lineNumber = 1; + // The position is zero-indexed; adding 1 to normalize it for the -2 that comes later + columnNumber = parseInt(match[1], 10) + 1; + } + } + + if (lineNumber < 1 || columnNumber < 2) { + // The problem is not with this replacement; just return a failure. + return; + } + + /* We need to skip e.lineNumber - 1 number of `\n` occurrences. + * Then, we need to go to e.columnNumber - 2 to look for `"\d+"`; we need to `-1` to + * account for the quote but an additional `-1` is needed because columnNumber starts from 1. + */ + const re = new RegExp( + `^((?:.*\\n){${lineNumber - 1}}[^\\n]{${columnNumber - 2}})"${bigIntMarker}(-?\\d+)"` + ); + if (!re.test(markedJSON)) { + // The exception is not caused by adding the marker + return; + } + + // We have found a bad replacement; let's remove it. + markedJSON = markedJSON.replace(re, '$1$2'); + } + } while (hadException); + + const bigIntMarkFinder = new RegExp(`^${bigIntMarker}-?\\d+$`); + + // Exceptions will trickle up to the caller + return sjson.parse( + markedJSON, + (key, val) => + /* Convert marked values to BigInt values. + * The `startsWith` is purely for performance, to avoid running `test` if not needed. + */ + typeof val === 'string' && val.startsWith(bigIntMarker) && bigIntMarkFinder.test(val) + ? BigInt(val.substring(length)) // eslint-disable-line no-undef + : val, + this[kJsonOptions] + ); + } + + _stringifyWithBigInt(object, candidate) { + const { bigIntMarker } = this._getSuitableBigIntMarker(candidate); + + /* The matcher that looks for "" + * Because we have made sure that `bigIntMarker` was never present in the original object, we can + * carelessly assume every "" is due to our marking. + */ + const markedBigIntMatcher = new RegExp(`"${bigIntMarker}(-?\\d+)"`, 'g'); + + return ( + JSON.stringify( + object, + /* Convert BigInt values to a string and mark them. + * Can't be bothered with Number values beyond safe values because they are already corrupted. + */ + (key, val) => (typeof val === 'bigint' ? `${bigIntMarker}${val.toString()}` : val) + ) + // Replace marked substrings with just the numerals + .replace(markedBigIntMatcher, '$1') + ); + } + serialize(object) { debug('Serializing', object); let json; + let numeralsAreNumbers = true; + const checkForBigInts = (key, val) => { + if (typeof val === 'bigint') { + numeralsAreNumbers = false; + // Number() is much faster than parseInt() on BigInt values + return Number(val); + } + return val; + }; try { - json = JSON.stringify(object); + json = JSON.stringify(object, isBigIntSupported ? checkForBigInts : null); + + if (isBigIntSupported && !numeralsAreNumbers) { + const temp = this._stringifyWithBigInt(object, json); + if (temp) json = temp; + } } catch (err) { throw new SerializationError(err.message, object); } @@ -58,8 +274,31 @@ class Serializer { deserialize(json) { debug('Deserializing', json); let object; + let numeralsAreNumbers = true; + const checkForLargeNumerals = (key, val) => { + if ( + numeralsAreNumbers && + typeof val === 'number' && + (val < Number.MAX_SAFE_INTEGER || val > Number.MAX_SAFE_INTEGER) + ) { + numeralsAreNumbers = false; + } + + return val; + }; try { - object = sjson.parse(json, this[kJsonOptions]); + object = sjson.parse( + json, + isBigIntSupported ? checkForLargeNumerals : null, + this[kJsonOptions] + ); + + if (isBigIntSupported && !numeralsAreNumbers) { + const temp = this._parseWithBigInt(json); + if (temp) { + object = temp; + } + } } catch (err) { throw new DeserializationError(err.message, json); } diff --git a/test/fixtures/longnumerals-dataset.ndjson b/test/fixtures/longnumerals-dataset.ndjson new file mode 100644 index 000000000..673862301 --- /dev/null +++ b/test/fixtures/longnumerals-dataset.ndjson @@ -0,0 +1,3 @@ +{"number":18014398509481982,"description":"-18014398509481982 , -1 , 1 , 18014398509481982"} +{"number":-18014398509481982,"description":"෴18014398509481982"} +{"number":9007199254740891,"description":"Safer than [18014398509481982]"} diff --git a/test/integration/serializer/longnumerals.test.js b/test/integration/serializer/longnumerals.test.js new file mode 100644 index 000000000..2e5675d2f --- /dev/null +++ b/test/integration/serializer/longnumerals.test.js @@ -0,0 +1,90 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + * + */ + +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +'use strict'; + +const { createReadStream } = require('fs'); +const { join } = require('path'); +const split = require('split2'); +const { test, beforeEach, afterEach } = require('tap'); + +const { Client } = require('../../../'); + +const INDEX = `test-serializer-${process.pid}`; +const client = new Client({ + node: process.env.TEST_OPENSEARCH_SERVER || 'http://localhost:9200', +}); + +beforeEach(async () => { + await client.indices.create({ index: INDEX }); + const stream = createReadStream( + join(__dirname, '..', '..', 'fixtures', 'longnumerals-dataset.ndjson') + ); + const result = await client.helpers.bulk({ + datasource: stream.pipe(split()), + refreshOnCompletion: true, + onDocument() { + return { + index: { _index: INDEX }, + }; + }, + }); + if (result.failed > 0) { + throw new Error('Failed bulk indexing docs'); + } +}); + +afterEach(async () => { + await client.indices.delete({ index: INDEX }, { ignore: 404 }); +}); + +test('long numerals', async (t) => { + const results = await client.helpers.search({ + index: INDEX, + body: { + query: { + range: { + number: { + lt: 999999999999999999n, + }, + }, + }, + }, + }); + t.equal(results.length, 3); + const object = {}; + for (const result of results) { + object[result.description] = result.number; + } + t.same(object, { + '-18014398509481982 , -1 , 1 , 18014398509481982': 18014398509481982n, + '෴18014398509481982': -18014398509481982n, + 'Safer than [18014398509481982]': 9007199254740891, + }); +}); diff --git a/test/unit/serializer.test.js b/test/unit/serializer.test.js index ccbb9baf2..f6c6339a7 100644 --- a/test/unit/serializer.test.js +++ b/test/unit/serializer.test.js @@ -43,6 +43,32 @@ test('Basic', (t) => { t.same(s.deserialize(json), obj); }); +test('Long numerals', (t) => { + t.plan(7); + const s = new Serializer(); + const longPositive = BigInt(Number.MAX_SAFE_INTEGER) * 2n; // eslint-disable-line no-undef + const longNegative = BigInt(Number.MIN_SAFE_INTEGER) * 2n; // eslint-disable-line no-undef + const json = + `{` + + // The space before and after the values, and the lack of spaces before comma are intentional + `"\\":${longPositive}": "[ ${longNegative.toString()}, ${longPositive.toString()} ]", ` + + `"positive": ${longPositive.toString()}, ` + + `"array": [ ${longNegative.toString()}, ${longPositive.toString()} ], ` + + `"negative": ${longNegative.toString()},` + + `"hardcoded": 102931203123987` + + `}`; + const obj = s.deserialize(json); + const res = s.serialize(obj); + t.equal(obj.positive, longPositive); + t.equal(obj.negative, longNegative); + t.same(obj.array, [longNegative, longPositive]); + // The space before and after the values, and the lack of spaces before comma are intentional + t.equal(obj['":' + longPositive], `[ ${longNegative.toString()}, ${longPositive.toString()} ]`); + t.equal(obj.hardcoded, 102931203123987); + t.equal(res.replace(/\s+/g, ''), json.replace(/\s+/g, '')); + t.match(res, `"[ ${longNegative.toString()}, ${longPositive.toString()} ]"`); +}); + test('ndserialize', (t) => { t.plan(1); const s = new Serializer();