Skip to content

Commit

Permalink
feat: parse triple backtick strings, discarding the info header (#1162)
Browse files Browse the repository at this point in the history
This is a more reliable way to get codegen outputs from LLMs.

We should add a documentation guide page for this (possibly a great
candidate for a community contribution), but in the meantime I've drafted
up a blog post.
<!-- ELLIPSIS_HIDDEN -->


----

> [!IMPORTANT]
> Add support for parsing triple backtick strings in the JSON parser,
> including dedenting and discarding info headers.
>
> - **Behavior**:
>   - Add `TripleBacktickString` variant to `JsonCollection` in
>     `json_collection.rs` to handle triple backtick strings.
>   - Update `process_token()` in `json_parse_state.rs` to parse triple
>     backtick strings, discarding the info header and dedenting content.
> - **Utilities**:
>   - Move `dedent` function to `dedent.rs` in `bstd` and update its usage
>     in `expression.rs` and `json_collection.rs`.
> - **Tests**:
>   - Add tests in `test_code.rs` to verify parsing of triple backtick
>     strings, including edge cases like nested backticks and dedenting.
>
> 
> <sup>This description was created by </sup>[<img alt="Ellipsis"
src="https://img.shields.io/badge/Ellipsis-blue?color=175173">](https://www.ellipsis.dev?ref=BoundaryML%2Fbaml&utm_source=github&utm_medium=referral)<sup>
for 3d5cd23. It will automatically
update as commits are pushed.</sup>


<!-- ELLIPSIS_HIDDEN -->
  • Loading branch information
sxlijin authored Nov 12, 2024
1 parent 3b1d152 commit 353b21e
Show file tree
Hide file tree
Showing 10 changed files with 565 additions and 102 deletions.
2 changes: 2 additions & 0 deletions engine/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions engine/baml-lib/jsonish/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,23 @@ unused_imports = "allow"
unused_variables = "allow"

[dependencies]
internal-baml-jinja = { path = "../jinja-runtime" }
internal-baml-core = { path = "../baml-core" }
anyhow.workspace = true
baml-types = { path = "../baml-types" }
bstd.workspace = true
colored = "2"
pest = "2.1.3"
indoc.workspace = true
internal-baml-jinja = { path = "../jinja-runtime" }
internal-baml-core = { path = "../baml-core" }
log = "0.4.20"
indexmap = "2.1.0"
strsim = "0.10.0"
serde_json.workspace = true
serde.workspace = true
# jsonschema = "0.17.1"
anyhow.workspace = true
either = "1.10.0"
test-log = "0.2.16"
regex.workspace = true

[dev-dependencies]
assert-json-diff = "2.0.2"
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use baml_types::BamlMap;
use bstd::dedent;

use crate::jsonish::Value;

Expand All @@ -10,6 +11,19 @@ pub enum JsonCollection {
QuotedString(String),
TripleQuotedString(String),
SingleQuotedString(String),
// edge cases that need handling:
// - triple backticks in a triple backtick string
// - will the LLM terminate a triple backtick with a single backtick? probably not
// - do we give the language specifier out? no
// - what if the triple backtick block contains both a lang and path specifier? e.g. ```tsx path/to/file.tsx
// should we hand back the path?
// - do we dedent the output?
// - is it an acceptable heuristic to discard the first line of a triple backtick block?
TripleBacktickString {
lang: Option<String>,
path: Option<String>,
content: String,
},
BacktickString(String),
// Handles numbers, booleans, null, and unquoted strings
UnquotedString(String),
Expand All @@ -26,6 +40,7 @@ impl JsonCollection {
JsonCollection::Array(_) => "Array",
JsonCollection::QuotedString(_) => "String",
JsonCollection::SingleQuotedString(_) => "String",
JsonCollection::TripleBacktickString { .. } => "TripleBacktickString",
JsonCollection::BacktickString(_) => "String",
JsonCollection::TripleQuotedString(_) => "TripleQuotedString",
JsonCollection::UnquotedString(_) => "UnquotedString",
Expand All @@ -51,6 +66,14 @@ impl From<JsonCollection> for Option<Value> {
JsonCollection::QuotedString(s) => Value::String(s),
JsonCollection::TripleQuotedString(s) => Value::String(s),
JsonCollection::SingleQuotedString(s) => Value::String(s),
JsonCollection::TripleBacktickString { content, .. } => {
let Some((fenced_codeblock_info, codeblock_contents)) = content.split_once("\n")
else {
return Some(Value::String(content));
};

Value::String(dedent(codeblock_contents).content)
}
JsonCollection::BacktickString(s) => Value::String(s),
JsonCollection::UnquotedString(s) => {
let s = s.trim();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,13 @@ impl JsonParseState {
| JsonCollection::BlockComment(s)
| JsonCollection::SingleQuotedString(s)
| JsonCollection::BacktickString(s)
| JsonCollection::TripleBacktickString { content: s, .. }
| JsonCollection::UnquotedString(s)
| JsonCollection::TrailingComment(s) => {
// println!("Consuming: {s} + {:?}", token);
s.push(token);
}
_ => {
JsonCollection::Object(_, _) | JsonCollection::Array(_) => {
panic!("Unexpected token: {:?} in: {:?}", token, last);
}
}
Expand Down Expand Up @@ -363,8 +364,8 @@ impl JsonParseState {
mut next: Peekable<impl Iterator<Item = (usize, char)>>,
) -> Result<usize> {
// println!("Processing: {:?}..{:?}", token, next.peek());
if let Some((last, _)) = self.collection_stack.last() {
match last {
match self.collection_stack.last() {
Some((last, _)) => match last {
JsonCollection::Object(_, _) => {
match token {
'}' => {
Expand Down Expand Up @@ -397,6 +398,10 @@ impl JsonParseState {
JsonCollection::TripleQuotedString(_) => {
// We should be expecting:
if token == '"' {
// TODO: this logic is busted. peekable.peek() does not
// advance the iterator (this is easily verified with
// a unit test), but to fix this we need to do a bit of
// refactoring, so for now we'll live with it.
let is_triple_quoted = match next.peek() {
Some((_, '"')) => match next.peek() {
Some((_, '"')) => true,
Expand Down Expand Up @@ -485,6 +490,35 @@ impl JsonParseState {
_ => self.consume(token),
}
}
JsonCollection::TripleBacktickString { .. } => {
// We could be expecting:
// - A closing backtick
// - A character
if token == '`' {
// TODO: this logic is busted. peekable.peek() does not
// advance the iterator (this is easily verified with
// a unit test), but to fix this we need to do a bit of
// refactoring, so for now we'll live with it.
let is_triple_quoted = match next.peek() {
Some((_, '`')) => match next.peek() {
Some((_, '`')) => true,
None => true,
_ => false,
},
None => true,
_ => false,
};

if is_triple_quoted {
self.complete_collection();
Ok(3)
} else {
self.consume(token)
}
} else {
self.consume(token)
}
}
JsonCollection::BacktickString(_) => {
// We could be expecting:
// - A closing backtick
Expand Down Expand Up @@ -564,13 +598,14 @@ impl JsonParseState {
_ => self.consume(token),
}
}
},
None => {
// We could be expecting:
// - A value
// - Any leading whitespace
let preview = next.peekable();
self.find_any_starting_value(token, preview)
}
} else {
// We could be expecting:
// - A value
// - Any leading whitespace
let preview = next.peekable();
self.find_any_starting_value(token, preview)
}
}

Expand Down Expand Up @@ -617,10 +652,29 @@ impl JsonParseState {
));
}
'`' => {
self.collection_stack.push((
JsonCollection::BacktickString(String::new()),
Default::default(),
));
// Check whether the next 2 characters are also backticks
let is_triple_quoted = {
next.next_if(|&(_, c)| c == '`')
.and_then(|_| next.next_if(|&(_, c)| c == '`'))
.is_some()
};

if is_triple_quoted {
self.collection_stack.push((
JsonCollection::TripleBacktickString {
lang: None,
path: None,
content: String::new(),
},
Default::default(),
));
return Ok(2);
} else {
self.collection_stack.push((
JsonCollection::BacktickString(String::new()),
Default::default(),
))
}
}
'/' => {
// Could be a comment
Expand Down
25 changes: 19 additions & 6 deletions engine/baml-lib/jsonish/src/tests/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,25 @@ macro_rules! test_failing_deserializer {
};
}

/// Arguments:
/// name: name of test function to generate.
/// file_content: a BAML schema.
/// raw_string: an example payload coming from an LLM to parse.
/// target_type: The type to try to parse raw_string into.
/// json: The expected JSON encoding that the parser should return.
/// Arguments
///
/// - `name`: The name of the test function to generate.
/// - `file_content`: A BAML schema used for the test.
/// - `raw_string`: An example payload coming from an LLM to parse.
/// - `target_type`: The type to try to parse `raw_string` into.
/// - `json`: The expected JSON encoding that the parser should return.
///
/// Example
///
/// ```rust
/// test_deserializer!(
/// my_test,
/// "schema_content",
/// "raw_payload",
/// MyType,
/// { "expected": "json" }
/// );
/// ```
macro_rules! test_deserializer {
($name:ident, $file_content:expr, $raw_string:expr, $target_type:expr, $($json:tt)+) => {
#[test_log::test]
Expand Down
Loading

0 comments on commit 353b21e

Please sign in to comment.