From 8e708732978eb6d392d02bea97177c824e73822d Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Sat, 10 Feb 2024 20:13:34 -0600 Subject: [PATCH] Add the levenshtein string similarity algorithm --- CHANGELOG.md | 2 +- Cargo.toml | 1 + Dockerfile | 4 +- README.md | 222 +++++++++++++++++++++++----- test-integration/tests/stringops.rs | 137 +++++++++++++++++ udf-stringops/Cargo.toml | 13 ++ udf-stringops/src/lib.rs | 126 ++++++++++++++++ udf-uuid/README.md | 2 +- 8 files changed, 464 insertions(+), 43 deletions(-) create mode 100644 test-integration/tests/stringops.rs create mode 100644 udf-stringops/Cargo.toml create mode 100644 udf-stringops/src/lib.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index f98c859..eb268b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ### Added -### Changed +Add the `levenshtein` string distance algorithm. ## [0.1.10] - 2023-10-05 diff --git a/Cargo.toml b/Cargo.toml index 435e190..019a832 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "udf-lipsum", "udf-uuid", "test-integration", + "udf-stringops", ] [profile.release] diff --git a/Dockerfile b/Dockerfile index d8bd080..434c45a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,9 +9,7 @@ # docker exec -it mdb-udf-suite-c mariadb -pexample # ``` -FROM rust:latest AS build - -ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse +FROM rust:1.76 AS build WORKDIR /build diff --git a/README.md b/README.md index d6921d8..f23454d 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,13 @@ The following UDFs are includes: `xxhash` - [IP Functions](#ip-address-functions) for interop: `ip_validate`, `ip_to_canonical`, `ip_to_ipv4_mapped` +- [String Operations](#string-operations): Calculations such as Levenshtein + edit distance, including limited and normalized versions. - [Jsonify](#jsonify): convert any data to JSON - [Lipsum](#lipsum): generate random text +See the relevant section for more information. 
+ ### UUID Provide UUID functions similar to the Postges [`uuid-osp`] package: @@ -34,6 +38,29 @@ Provide UUID functions similar to the Postges [`uuid-osp`] package: See the [UUID Readme](/udf-uuid/README.md) for more information +#### Usage + +```text +note: type uuid is type string +uuid_generate_v1() -> uuid +uuid_generate_v1mc() -> uuid +uuid_generate_v4() -> uuid +uuid_generate_v6([node_addr: string]) -> uuid +uuid_generate_v7() -> uuid +uuid_nil() -> uuid +uuid_max() -> uuid +uuid_ns_dns() -> string +uuid_ns_url() -> string +uuid_ns_oid() -> string +uuid_ns_x500() -> string +uuid_is_valid(uuid: uuid) -> boolean +uuid_to_bin(uuid: uuid) -> bytes +uuid_from_bin(bin: bytes) -> uuid +bin_to_uuid(bin: bytes) -> uuid +``` + +#### Examples + ```text MariaDB [(none)]> select uuid_generate_v6(); +--------------------------------------+ @@ -54,7 +81,7 @@ MariaDB [(none)]> select hex(uuid_to_bin(uuid_generate_v4())); [`uuid-osp`]: https://www.postgresql.org/docs/current/uuid-ossp.html -## Hash Algorithms +### Hash Algorithms This library provides the following functions: @@ -68,11 +95,54 @@ This library provides the following functions: - `xxhash3`, `xxhash32`, `xxhash64`, `xxhash` (`xxhash` is an alias for `xxhash64`) -All of these return hex strings by defaulti. `_bin` functions are also +All of these return hex strings by default. `_bin` functions are also provided that return the binary result without going through hexification, suitable for storage in a `BINARY(X)` column. 
+#### Usage + +```text +blake2b512(a: any [, ...]) -> string +blake2b512_bin(a: any [, ...]) -> bytes +blake2s256(a: any [, ...]) -> string +blake2s256_bin(a: any [, ...]) -> bytes +blake3(a: any [, ...]) -> string +blake3_bin(a: any [, ...]) -> bytes +blake3_thd(a: any [, ...]) -> string +blake3_thd_bin(a: any [, ...]) -> bytes +md5_u(a: any [, ...]) -> string +md5_u_bin(a: any [, ...]) -> bytes +sha1_u(a: any [, ...]) -> string +sha1_u_bin(a: any [, ...]) -> bytes +sha224(a: any [, ...]) -> string +sha224_bin(a: any [, ...]) -> bytes +sha256(a: any [, ...]) -> string +sha256_bin(a: any [, ...]) -> bytes +sha384(a: any [, ...]) -> string +sha384_bin(a: any [, ...]) -> bytes +sha512(a: any [, ...]) -> string +sha512_bin(a: any [, ...]) -> bytes +keccak224(a: any [, ...]) -> string +keccak224_bin(a: any [, ...]) -> bytes +keccak256(a: any [, ...]) -> string +keccak256_bin(a: any [, ...]) -> bytes +sha3_224(a: any [, ...]) -> string +sha3_224_bin(a: any [, ...]) -> bytes +sha3_256(a: any [, ...]) -> string +sha3_256_bin(a: any [, ...]) -> bytes +sha3_384(a: any [, ...]) -> string +sha3_384_bin(a: any [, ...]) -> bytes +sha3_512(a: any [, ...]) -> string +sha3_512_bin(a: any [, ...]) -> bytes +xxhash(a: any [, ...]) -> integer +xxhash3(a: any [, ...]) -> integer +xxhash32(a: any [, ...]) -> integer +xxhash64(a: any [, ...]) -> integer +``` + +#### Examples + ```text MariaDB [(none)]> select blake3("Hello, world!"); +------------------------------------------------------------------+ @@ -114,15 +184,67 @@ MariaDB [(none)]> select xxhash('Hello, ', 0x77, 'orld', '!'); Note that in SQL, all integers are an `i64`, all floats are a `f64`, and all decimals are represented as a string to the UDF API. This library hashes these -types as their little endian representation. (You only need to worry about this -if you have very obscure platform compatibility requirements, and strings and -blobs are always unambiguous). 
+types as their little endian representation on all platforms. (You only need +to worry about this if you have very obscure platform compatibility +requirements. Strings and blobs are always unambiguous). + +### String Operations + +Provide the function `levenshtein`, which calculates the Levenshtein edit +distance between two strings. There is also `levenshtein_normalized` that +returns a value between 0.0 (identical) and 1.0 (significantly different). + +If a limit is provided as a third argument, the operation will terminate if +that limit is exceeded, returning the limit itself. This can help to improve performance if filtering +dissimilar strings. + +These algorithms provide a _byte_ edit distance, rather than unicode chars or +graphemes. These options may be added in the future. + +These algorithms are implemented by the [`rapidfuzz`] crate. + +[`rapidfuzz`]: https://crates.io/crates/rapidfuzz + +#### Usage + +```text +levenshtein(a: str, b: str [, limit: integer]) -> integer; +levenshtein_normalized(a: str, b: str [, limit: real]) -> real; +``` + +#### Examples + +```text +MariaDB [(none)]> SELECT levenshtein('foo', 'moose'), levenshtein_normalized('foo', 'moos'); ++-----------------------------+---------------------------------------+ +| levenshtein('foo', 'moose') | levenshtein_normalized('foo', 'moos') | ++-----------------------------+---------------------------------------+ +| 3 | 0.5 | ++-----------------------------+---------------------------------------+ +1 row in set (0.001 sec) + +MariaDB [(none)]> SELECT levenshtein('foo', 'moose', 2), levenshtein_normalized('foo', 'moos', 0.3); ++--------------------------------+--------------------------------------------+ +| levenshtein('foo', 'moose', 2) | levenshtein_normalized('foo', 'moos', 0.3) | ++--------------------------------+--------------------------------------------+ +| 2 | 0.3 | ++--------------------------------+--------------------------------------------+ +1 row in set (0.001 sec) +``` ### Jsonify Provide the function 
`jsonify`, which quickly creates JSON output for any given inputs. +#### Usage + +```text +jsonify(a: any [, ...]) -> string +``` + +#### Examples + ```text MariaDB [db]> select jsonify(qty, cost, class) from t1 limit 4; +-------------------------------------+ @@ -155,6 +277,14 @@ MariaDB [db]> select jsonify(uuid() as uuid, qty as quantity, cost) from t1 limi Uses the [lipsum crate] to generate lipsum strings with a specified word count. +#### Usage + +```text +lipsum(count: integer [, seed: integer]) -> string +``` + +#### Examples + ```text MariaDB [(none)]> select lipsum(10); @@ -168,7 +298,7 @@ MariaDB [(none)]> select lipsum(10); [lipsum crate]: https://docs.rs/lipsum/latest/lipsum/ -## IP Address Functions +### IP Address Functions We provide three IP functions: @@ -177,7 +307,18 @@ We provide three IP functions: - `ip_to_ipv6_mapped` which converts ipv4 addresses to their ipv6 form (e.g. for interop with the `INET6` data type) - `ip_to_canonical` which reverses the mapping operation + +#### Usage + +```text +ip_validate(ip: string) -> string +ip_to_canonical(ip: string) -> string +ip_to_ipv6_mapped(ip: string) -> string ``` + +#### Examples + +```text MariaDB [db]> select -> input, -> ip_validate(input), @@ -205,11 +346,12 @@ The desired files can be copied to the plugin directory (usually `/usr/lib/mysql/plugin`) and selectively loaded: ```sql +-- **** Hash functions **** CREATE OR REPLACE FUNCTION blake2b512 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION blake2s256 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION blake3 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION blake3_thd RETURNS string SONAME 'libudf_hash.so'; --- the md5 and sha functions have builtin versions +-- the md5 and sha functions have builtin versions, hence the `_u` suffix CREATE OR REPLACE FUNCTION md5_u RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha1_u RETURNS string SONAME 'libudf_hash.so'; CREATE OR 
REPLACE FUNCTION sha224 RETURNS string SONAME 'libudf_hash.so'; @@ -221,7 +363,6 @@ CREATE OR REPLACE FUNCTION keccak256 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_224 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_256 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_384 RETURNS string SONAME 'libudf_hash.so'; -CREATE OR REPLACE FUNCTION sha3_384_bin RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_512 RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION xxhash RETURNS integer SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION xxhash3 RETURNS integer SONAME 'libudf_hash.so'; @@ -245,36 +386,41 @@ CREATE OR REPLACE FUNCTION keccak224_bin RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION keccak256_bin RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_224_bin RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_256_bin RETURNS string SONAME 'libudf_hash.so'; +CREATE OR REPLACE FUNCTION sha3_384_bin RETURNS string SONAME 'libudf_hash.so'; CREATE OR REPLACE FUNCTION sha3_512_bin RETURNS string SONAME 'libudf_hash.so'; --- JSON creation function -CREATE FUNCTION jsonify RETURNS string SONAME 'libudf_jsonify.so'; - --- IP functions -CREATE FUNCTION ip_validate RETURNS string SONAME 'libudf_net.so'; -CREATE FUNCTION ip_to_canonical RETURNS string SONAME 'libudf_net.so'; -CREATE FUNCTION ip_to_ipv6_mapped RETURNS string SONAME 'libudf_net.so'; - --- random string generation -CREATE FUNCTION lipsum RETURNS string SONAME 'libudf_lipsum.so'; - --- UUID interfaces -CREATE FUNCTION uuid_generate_v1 RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_generate_v1mc RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_generate_v4 RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_generate_v6 RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_generate_v7 RETURNS string 
SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_nil RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_max RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_ns_dns RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_ns_url RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_ns_oid RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_ns_x500 RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_is_valid RETURNS integer SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_to_bin RETURNS string SONAME 'libudf_uuid.so'; -CREATE FUNCTION uuid_from_bin RETURNS string SONAME 'libudf_uuid.so'; +-- **** JSON creation function **** +CREATE OR REPLACE FUNCTION jsonify RETURNS string SONAME 'libudf_jsonify.so'; + +-- **** IP functions **** +CREATE OR REPLACE FUNCTION ip_validate RETURNS string SONAME 'libudf_net.so'; +CREATE OR REPLACE FUNCTION ip_to_canonical RETURNS string SONAME 'libudf_net.so'; +CREATE OR REPLACE FUNCTION ip_to_ipv6_mapped RETURNS string SONAME 'libudf_net.so'; + +-- **** string operation functions **** +CREATE OR REPLACE FUNCTION levenshtein RETURNS integer SONAME 'libudf_stringops.so'; +CREATE OR REPLACE FUNCTION levenshtein_normalized RETURNS real SONAME 'libudf_stringops.so'; + +-- **** random string generation **** +CREATE OR REPLACE FUNCTION lipsum RETURNS string SONAME 'libudf_lipsum.so'; + +-- **** UUID interfaces **** +CREATE OR REPLACE FUNCTION uuid_generate_v1 RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_generate_v1mc RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_generate_v4 RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_generate_v6 RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_generate_v7 RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_nil RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_max RETURNS string SONAME 'libudf_uuid.so'; +CREATE 
OR REPLACE FUNCTION uuid_ns_dns RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_ns_url RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_ns_oid RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_ns_x500 RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_is_valid RETURNS integer SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_to_bin RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION uuid_from_bin RETURNS string SONAME 'libudf_uuid.so'; -- `bin_to_uuid` and 'uuid_from_bin' are aliases -CREATE FUNCTION bin_to_uuid RETURNS string SONAME 'libudf_uuid.so'; +CREATE OR REPLACE FUNCTION bin_to_uuid RETURNS string SONAME 'libudf_uuid.so'; ``` Note that Windows `.dll`s are built but have not been tested - please open an @@ -305,14 +451,14 @@ docker build . --tag mdb-udf-suite-img # run it in the background docker run --rm -d \ -e MARIADB_ROOT_PASSWORD=example \ - --name mdb_udf_suite \ + --name mdb-udf-suite \ mdb-udf-suite-img # Enter a SQL shell -docker exec -it mdb_udf_suite mariadb -pexample +docker exec -it mdb-udf-suite mariadb -pexample # Stop the server when done -docker stop mdb_udf_suite +docker stop mdb-udf-suite ``` The UDFs can then be loaded using the `CREATE FUNCTION` statements above. 
diff --git a/test-integration/tests/stringops.rs b/test-integration/tests/stringops.rs new file mode 100644 index 0000000..9d016e0 --- /dev/null +++ b/test-integration/tests/stringops.rs @@ -0,0 +1,137 @@ +#![cfg(feature = "backend")] + +mod backend; + +use std::assert_eq; +use std::cmp::min; + +use backend::get_db_connection; +use mysql::prelude::*; + +const SETUP: &[&str] = &[ + "CREATE OR REPLACE FUNCTION levenshtein RETURNS integer + SONAME 'libudf_stringops.so'", + "CREATE OR REPLACE FUNCTION levenshtein_normalized RETURNS real + SONAME 'libudf_stringops.so'", +]; + +/// `((a, b), expected)`: byte edit distance with no limit argument +const TESTS: &[((&str, &str), i64)] = &[ + (("abcd", "abcd"), 0), + (("abcd", "ab"), 2), + (("ab", "abcd"), 2), + (("abcd", "ad"), 2), + (("abcd", "cd"), 2), + (("abcd", "a"), 3), + (("abcd", "c"), 3), + (("abcd", "accd"), 1), + (("kitten", "sitting"), 3), + (("sitting", "kitten"), 3), + (("not", "to a"), 3), + (("to be a bee", "not to bee"), 6), +]; + +/// `((a, b, limit), expected)`: the expected distance is capped at `limit` +const TESTS_LIMIT: &[((&str, &str, u32), i64)] = &[ + (("abcd", "abcd", 1), 0), + (("abcdef", "", 3), 3), + (("", "abcdef", 3), 3), + (("abcdef", "", 8), 6), + (("", "abcdef", 8), 6), + (("abcdef", "000000", 3), 3), + (("ab", "0000", 3), 3), +]; + +/// `((a, b), expected)`: normalized distance, compared using `approx_eq` +const TESTS_NORMALIZED: &[((&str, &str), f64)] = &[ + (("abcd", "abcd"), 0.0), + (("abcd", "ab"), 0.5), + (("ab", "abcd"), 0.5), + (("abcd", "ad"), 0.5), + (("abcd", "cd"), 0.5), + (("abcd", "a"), 0.75), + (("abcd", "c"), 0.75), + (("abcd", "accd"), 0.25), + (("kitten", "sitting"), 0.42), + (("sitting", "kitten"), 0.42), + (("not", "to a"), 0.75), + (("to be a bee", "not to bee"), 0.54), +]; + +/// `((a, b, limit), expected)`: the expected normalized distance is capped at `limit` +const TESTS_NORMALIZED_LIMIT: &[((&str, &str, f64), f64)] = &[ + (("abcd", "abcd", 1.0), 0.0), + (("abcdef", "", 0.2), 0.2), + (("", "abcdef", 0.2), 0.2), + (("abcdef", "", 0.6), 0.6), + (("", "abcdef", 0.6), 0.6), + (("abcdef", "000000", 0.5), 0.5), + (("ab", "0000", 0.5), 0.5), +]; + +#[test] +fn 
test_levenshtein() { + let conn = &mut get_db_connection(SETUP); + + for (params, expected) in TESTS { + let res: i64 = conn + .exec_first("select levenshtein(?, ?)", params) + .unwrap() + .unwrap(); + + assert_eq!(res, *expected, "params {params:?} -> {expected} failed"); + } +} + +#[test] +fn test_levenshtein_limit() { + let conn = &mut get_db_connection(SETUP); + + for (params, expected) in TESTS_LIMIT { + let res: i64 = conn + .exec_first("select levenshtein(?, ?, ?)", params) + .unwrap() + .unwrap(); + + assert_eq!(res, *expected, "params {params:?} -> {expected} failed"); + } +} + +#[test] +fn test_levenshtein_normalized() { + let conn = &mut get_db_connection(SETUP); + + for (params, expected) in TESTS_NORMALIZED { + let res: f64 = conn + .exec_first("select levenshtein_normalized(?, ?)", params) + .unwrap() + .unwrap(); + + assert!( + approx_eq(res, *expected), + "params {params:?} -> {expected} failed: {res}" + ); + } +} + +#[test] +fn test_levenshtein_normalized_limit() { + let conn = &mut get_db_connection(SETUP); + + for (params, expected) in TESTS_NORMALIZED_LIMIT { + let res: f64 = conn + .exec_first("select levenshtein_normalized(?, ?, ?)", params) + .unwrap() + .unwrap(); + + assert!( + approx_eq(res, *expected), + "params {params:?} -> {expected} failed: {res}" + ); + } +} + +fn approx_eq(a: f64, b: f64) -> bool { + const EPSILON: f64 = 0.01; + (a - b).abs() < EPSILON +} diff --git a/udf-stringops/Cargo.toml b/udf-stringops/Cargo.toml new file mode 100644 index 0000000..5270283 --- /dev/null +++ b/udf-stringops/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "udf-stringops" +version = "0.1.10" +edition = "2021" +publish = false +license = "Apache-2.0 OR GPL-2.0-or-later" + +[lib] +crate-type = ["cdylib"] + +[dependencies] +rapidfuzz = "0.5.0" +udf = { version = "0.5.4", features = ["mock"] } diff --git a/udf-stringops/src/lib.rs b/udf-stringops/src/lib.rs new file mode 100644 index 0000000..71a9c47 --- /dev/null +++ b/udf-stringops/src/lib.rs @@ -0,0 
+1,126 @@ +//! Functions to calculate the (optionally normalized) Levenshtein distance between two strings. +//! +//! If a third argument is provided, the value returned by this function will not exceed +//! that limit. +//! +//! # Usage +//! +//! ```sql +//! CREATE FUNCTION levenshtein RETURNS integer SONAME 'libudf_stringops.so'; +//! CREATE FUNCTION levenshtein_normalized RETURNS real SONAME 'libudf_stringops.so'; +//! ``` + +use rapidfuzz::distance::levenshtein; +use udf::prelude::*; + +const I32_MAX: usize = i32::MAX as usize; + +struct Levenshtein; + +#[register(name = "levenshtein")] +impl BasicUdf for Levenshtein { + type Returns<'a> = i64; + + fn init(_cfg: &UdfCfg<Init>, args: &ArgList<Init>) -> Result<Self, String> { + init_check_args(args, SqlType::Int)?; + Ok(Self) + } + + fn process<'a>( + &'a mut self, + _cfg: &UdfCfg<Process>, + args: &ArgList<Process>, + _error: Option<NonZeroU8>, + ) -> Result<Self::Returns<'a>, ProcessError> { + // Coercions were set in `init`; NULL or non-UTF-8 input is reported as an error + let a_arg = args.get(0).unwrap().value(); + let b_arg = args.get(1).unwrap().value(); + let a = a_arg.as_string().ok_or(ProcessError)?; + let b = b_arg.as_string().ok_or(ProcessError)?; + + if a.len() > I32_MAX || b.len() > I32_MAX { + return Err(ProcessError); + } + + let res = match args.get(2) { + Some(arg) => { + let limit_i64 = arg.value().as_int().ok_or(ProcessError)?.clamp(0, i64::MAX); + let limit = usize::try_from(limit_i64).unwrap_or(usize::MAX); + let args = levenshtein::Args::default().score_cutoff(limit); + levenshtein::distance_with_args(a.bytes(), b.bytes(), &args).unwrap_or(limit) + } + None => levenshtein::distance(a.bytes(), b.bytes()), + }; + + Ok(res.try_into().unwrap()) + } +} + +struct LevenshteinNormalized; + +#[register(name = "levenshtein_normalized")] +impl BasicUdf for LevenshteinNormalized { + type Returns<'a> = f64; + + fn init(_cfg: &UdfCfg<Init>, args: &ArgList<Init>) -> Result<Self, String> { + init_check_args(args, SqlType::Real)?; + Ok(Self) + } + + fn process<'a>( + &'a mut self, + _cfg: &UdfCfg<Process>, + args: &ArgList<Process>, + _error: Option<NonZeroU8>, + ) -> Result<Self::Returns<'a>, ProcessError> { + // Coercions were set in `init`; NULL or 
non-UTF-8 input is reported as an error + let a_arg = args.get(0).unwrap().value(); + let b_arg = args.get(1).unwrap().value(); + let a = a_arg.as_string().ok_or(ProcessError)?; + let b = b_arg.as_string().ok_or(ProcessError)?; + + if a.len() > I32_MAX || b.len() > I32_MAX { + return Err(ProcessError); + } + + let res = match args.get(2) { + Some(arg) => { + let limit = arg.value().as_real().ok_or(ProcessError)?.clamp(0.0, 1.0); + let args = levenshtein::Args::default().score_cutoff(limit); + levenshtein::normalized_distance_with_args(a.bytes(), b.bytes(), &args) + .unwrap_or(limit) + } + None => levenshtein::normalized_distance(a.bytes(), b.bytes()), + }; + + Ok(res) + } +} + +/// Check the argument count and set coercions; shared by both UDFs' `init` +fn init_check_args(args: &ArgList<Init>, limit_coercion: SqlType) -> Result<(), String> { + // Lazy error message generation + let make_emsg = || { + format!( + "usage: levenshtein[_normalized](a: str, b: str [, limit: int|real]). Got {} args", + args.len() + ) + }; + + let (Some(mut a_arg), Some(mut b_arg)) = (args.get(0), args.get(1)) else { + return Err(make_emsg()); + }; + + if args.len() > 3 { + return Err(make_emsg()); + } + + a_arg.set_type_coercion(SqlType::String); + b_arg.set_type_coercion(SqlType::String); + + if let Some(mut limit_arg) = args.get(2) { + limit_arg.set_type_coercion(limit_coercion); + } + + Ok(()) +} diff --git a/udf-uuid/README.md b/udf-uuid/README.md index a4fa8d3..a57c834 100644 --- a/udf-uuid/README.md +++ b/udf-uuid/README.md @@ -111,7 +111,7 @@ SELECT uuid_generate_v6(); SELECT uuid_generate_v6('123abc'); SELECT uuid_generate_v7(); --- UUID constants +-- UUID constants and namespaces SELECT uuid_nil(); SELECT uuid_max(); SELECT uuid_ns_dns();