Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ rusqlite = { version = "0.31.0", features = ["blob", "serde_json", "i128_blob",
thiserror = "1.0.65"
tikv-jemallocator = "0.5.4"
toml = "0.5.6"
criterion = "0.8.2"

# Use a bit more than default optimization for
# dev builds to speed up test execution
Expand Down
2 changes: 2 additions & 0 deletions changelog.d/6948-utf8char-fixed-array-repr.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Replaced the internal representation of `UTF8Data` from `Vec<Vec<u8>>` to `Vec<Utf8Char>` (`[u8; 4]`), reducing heap allocations and speeding up clones.
Made the inner field of `Utf8Char` private, with validation on deserialization.
6 changes: 6 additions & 0 deletions clarity-types/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@ slog = { workspace = true }
stacks_common = { package = "stacks-common", path = "../stacks-common", default-features = false }

[dev-dependencies]
criterion = { workspace = true}
mutants = "0.0.3"
rstest = "0.17.0"
serde_json = "1.0"

[[bench]]
name = "utf8_data"
harness = false

[features]
default = []
Expand Down
144 changes: 144 additions & 0 deletions clarity-types/benches/utf8_data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (C) 2026 Stacks Open Internet Foundation
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::hint::black_box;

use criterion::{Criterion, criterion_group, criterion_main};

// --- Helpers to build comparable data structures ---

/// New representation: a flat `Vec<char>` holding `n` copies of ASCII `'A'`.
///
/// Each element is stored inline in the vector, so no per-character heap
/// allocation takes place.
fn make_new_ascii(n: usize) -> Vec<char> {
    vec!['A'; n]
}

/// New representation: a flat `Vec<char>` holding `n` copies of a multi-byte
/// character.
fn make_new_multibyte(n: usize) -> Vec<char> {
    // U+2603 snowman = 0xE2 0x98 0x83 (3 bytes in UTF-8)
    let snowman = '\u{2603}';
    std::iter::repeat(snowman).take(n).collect()
}

/// Old representation: `Vec<Vec<u8>>` with one heap-allocated byte vector per
/// character. Produces `n` entries, each containing the single byte `b'A'`.
fn make_old_ascii(n: usize) -> Vec<Vec<u8>> {
    vec![vec![b'A']; n]
}

/// Old representation: `Vec<Vec<u8>>` with `n` entries, each holding the
/// three UTF-8 bytes of U+2603 (snowman) in its own byte vector.
fn make_old_multibyte(n: usize) -> Vec<Vec<u8>> {
    std::iter::repeat_with(|| vec![0xE2, 0x98, 0x83])
        .take(n)
        .collect()
}

/// Registers the `utf8_data` benchmark group, comparing the flat `Vec<char>`
/// layout ("new") with the per-character `Vec<Vec<u8>>` layout ("old").
///
/// For each input size (100 and 1000 characters) and each input kind (ASCII
/// and the 3-byte snowman U+2603) three operations are measured:
///
/// * `*_clone_*` — cloning an already-built container,
/// * `*_construct_*` — building the container from a constant element,
/// * `*_value_construct_*` — validating UTF-8 bytes, decoding the chars and
///   collecting them into the container.
///
/// Inputs are routed through `black_box` so the optimizer cannot specialize
/// the measured code on the known sizes or contents.
fn bench_utf8(c: &mut Criterion) {
    let mut group = c.benchmark_group("utf8_data");

    for size in [100, 1000] {
        let new_ascii = make_new_ascii(size);
        let old_ascii = make_old_ascii(size);
        let new_multi = make_new_multibyte(size);
        let old_multi = make_old_multibyte(size);

        // --- Clone: copy an existing container ---
        for (label, new_chars, old_chars) in [
            ("ascii", &new_ascii, &old_ascii),
            ("multibyte", &new_multi, &old_multi),
        ] {
            group.bench_function(format!("new_clone_{label}_{size}"), |b| {
                b.iter(|| black_box(black_box(new_chars).clone()));
            });
            group.bench_function(format!("old_clone_{label}_{size}"), |b| {
                b.iter(|| black_box(black_box(old_chars).clone()));
            });
        }

        // --- Construction: raw data structure comparison ---
        group.bench_function(format!("new_construct_ascii_{size}"), |b| {
            b.iter(|| {
                black_box((0..black_box(size)).map(|_| 'A').collect::<Vec<char>>());
            });
        });
        group.bench_function(format!("old_construct_ascii_{size}"), |b| {
            b.iter(|| {
                black_box(
                    (0..black_box(size))
                        .map(|_| vec![b'A'])
                        .collect::<Vec<Vec<u8>>>(),
                );
            });
        });

        group.bench_function(format!("new_construct_multibyte_{size}"), |b| {
            b.iter(|| {
                black_box(
                    (0..black_box(size))
                        .map(|_| '\u{2603}')
                        .collect::<Vec<char>>(),
                );
            });
        });
        group.bench_function(format!("old_construct_multibyte_{size}"), |b| {
            b.iter(|| {
                black_box(
                    (0..black_box(size))
                        .map(|_| vec![0xE2u8, 0x98, 0x83])
                        .collect::<Vec<Vec<u8>>>(),
                );
            });
        });

        // --- Full bytes→data pipeline (end-to-end) ---
        // Both paths: validate UTF-8 → decode chars → collect.
        // New: Vec<char> (elements stored inline in the vector).
        // Old: Vec<Vec<u8>> (one heap-allocated byte vector per char).
        let ascii_bytes: Vec<u8> = "A".repeat(size).into_bytes();
        let multi_bytes: Vec<u8> = "\u{2603}".repeat(size).into_bytes();

        for (label, bytes) in [
            ("ascii", ascii_bytes.as_slice()),
            ("multibyte", multi_bytes.as_slice()),
        ] {
            group.bench_function(format!("new_value_construct_{label}_{size}"), |b| {
                b.iter(|| {
                    let s = std::str::from_utf8(black_box(bytes)).unwrap();
                    black_box(s.chars().collect::<Vec<char>>());
                });
            });
            group.bench_function(format!("old_value_construct_{label}_{size}"), |b| {
                b.iter(|| {
                    let s = std::str::from_utf8(black_box(bytes)).unwrap();
                    black_box(
                        s.chars()
                            .map(|c| {
                                // Exactly-sized buffer: one allocation per char,
                                // as the old layout required.
                                let mut buf = vec![0u8; c.len_utf8()];
                                c.encode_utf8(&mut buf);
                                buf
                            })
                            .collect::<Vec<Vec<u8>>>(),
                    );
                });
            });
        }
    }

    group.finish();
}

criterion_group!(benches, bench_utf8);
criterion_main!(benches);
112 changes: 105 additions & 7 deletions clarity-types/src/tests/types/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,6 @@ fn test_ascii_data_to_value_returns_clarity_type_error() {
assert_eq!(ClarityTypeError::InvalidAsciiCharacter(1), err);
}

/// `UTF8Data::to_value` must reject a byte sequence that is not valid UTF-8
/// and report `ClarityTypeError::InvalidUtf8Encoding`.
#[test]
fn test_utf8_data_to_value_returns_clarity_types_error_invalid_utf8_encoding() {
// 0xED 0xA0 0x80 would encode the surrogate U+D800, which UTF-8 forbids.
let err = UTF8Data::to_value(&vec![0xED, 0xA0, 0x80]).unwrap_err();
assert_eq!(ClarityTypeError::InvalidUtf8Encoding, err);
}

#[test]
fn test_tuple_data_from_data_typed_returns_clarity_type_error() {
let tuple_type =
Expand Down Expand Up @@ -587,7 +581,7 @@ fn test_ascii_data_len_returns_clarity_type_error() {
#[test]
fn test_utf8_data_len_returns_clarity_type_error() {
let err = UTF8Data {
data: vec![vec![]; MAX_VALUE_SIZE as usize + 1],
data: vec!['\0'; MAX_VALUE_SIZE as usize + 1],
}
.len()
.unwrap_err();
Expand Down Expand Up @@ -627,3 +621,107 @@ fn invalid_utf8_string_from_bytes() {

assert!(matches!(err, ClarityTypeError::InvalidUtf8Encoding));
}

/// A pure-ASCII `UTF8Data` survives a JSON serialize/deserialize round trip.
#[test]
fn utf8data_serde_roundtrip_ascii() {
    let data = UTF8Data {
        data: "Hi".chars().collect(),
    };
    let encoded = serde_json::to_string(&data).unwrap();
    let deserialized = serde_json::from_str::<UTF8Data>(&encoded).unwrap();
    assert_eq!(data, deserialized);
}

/// A single 3-byte character survives a JSON serialize/deserialize round trip.
#[test]
fn utf8data_serde_roundtrip_multibyte() {
    // Snowman U+2603
    let snowman = '\u{2603}';
    let data = UTF8Data {
        data: vec![snowman],
    };
    let encoded = serde_json::to_string(&data).unwrap();
    let deserialized = serde_json::from_str::<UTF8Data>(&encoded).unwrap();
    assert_eq!(data, deserialized);
}

/// A single 4-byte character survives a JSON serialize/deserialize round trip.
#[test]
fn utf8data_serde_roundtrip_four_byte() {
    // Grinning face U+1F600
    let grinning_face = '\u{1F600}';
    let data = UTF8Data {
        data: vec![grinning_face],
    };
    let encoded = serde_json::to_string(&data).unwrap();
    let deserialized = serde_json::from_str::<UTF8Data>(&encoded).unwrap();
    assert_eq!(data, deserialized);
}

/// A mix of 1-, 3- and 4-byte characters survives a JSON round trip.
#[test]
fn utf8data_serde_roundtrip_mixed() {
    // "A" + snowman + grinning face
    let data = UTF8Data {
        data: "A\u{2603}\u{1F600}".chars().collect(),
    };
    let encoded = serde_json::to_string(&data).unwrap();
    let deserialized = serde_json::from_str::<UTF8Data>(&encoded).unwrap();
    assert_eq!(data, deserialized);
}

/// An ASCII character is serialized as its single UTF-8 byte, without padding.
#[test]
fn utf8data_serializes_only_significant_bytes() {
    // ASCII 'A' should serialize as [65], not [65, 0, 0, 0]
    let single_a = UTF8Data { data: vec!['A'] };
    let json = serde_json::to_string(&single_a).unwrap();
    assert_eq!(json, "[[65]]");
}

/// A 3-byte character is serialized as exactly its three UTF-8 bytes.
#[test]
fn utf8data_serializes_multibyte_significant_bytes() {
    // Snowman should serialize as [226, 152, 131], not [226, 152, 131, 0]
    let snowman = '\u{2603}';
    let single_snowman = UTF8Data {
        data: vec![snowman],
    };
    let json = serde_json::to_string(&single_snowman).unwrap();
    assert_eq!(json, "[[226,152,131]]");
}

/// JSON written as an array of per-character byte arrays still deserializes.
#[test]
fn utf8data_deserializes_from_old_format() {
    // Old format: Vec<Vec<u8>> — ensure backward compatibility
    let old_json = "[[65],[226,152,131],[240,159,152,128]]";
    let expected = UTF8Data {
        data: vec!['A', '\u{2603}', '\u{1F600}'],
    };
    let deserialized = serde_json::from_str::<UTF8Data>(old_json).unwrap();
    assert_eq!(deserialized, expected);
}

/// An empty `UTF8Data` serializes to `[]` and deserializes back unchanged.
#[test]
fn utf8data_serde_roundtrip_empty() {
    let data = UTF8Data { data: Vec::new() };
    let json = serde_json::to_string(&data).unwrap();
    assert_eq!(json, "[]");
    let deserialized = serde_json::from_str::<UTF8Data>(&json).unwrap();
    assert_eq!(data, deserialized);
}

/// Deserialization fails when an entry is not valid UTF-8.
#[test]
fn utf8data_deserialize_rejects_invalid_utf8() {
    // 0xFF is not a valid UTF-8 leading byte
    let invalid_entry = "[[255]]";
    let outcome = serde_json::from_str::<UTF8Data>(invalid_entry);
    outcome.unwrap_err();
}

/// Deserialization fails when an entry contains no bytes at all.
#[test]
fn utf8data_deserialize_rejects_empty_entry() {
    // An empty byte array doesn't represent a character
    let empty_entry = "[[]]";
    let outcome = serde_json::from_str::<UTF8Data>(empty_entry);
    outcome.unwrap_err();
}

/// Deserialization fails when one entry holds more than one code point.
#[test]
fn utf8data_deserialize_rejects_multi_codepoint_entry() {
    // Two ASCII chars in a single entry
    let two_chars_in_one_entry = "[[65, 66]]";
    let outcome = serde_json::from_str::<UTF8Data>(two_chars_in_one_entry);
    outcome.unwrap_err();
}
8 changes: 4 additions & 4 deletions clarity-types/src/tests/types/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,25 +416,25 @@ fn test_principals() {
}

#[test]
fn test_serialize_to_vec_returns_serialization_failure() {
fn test_serialize_to_vec_returns_serialization_error_bad_type_error() {
let value = Value::Sequence(SequenceData::String(CharType::ASCII(ASCIIData {
data: vec![0; MAX_VALUE_SIZE as usize + 1],
})));
let err = value.serialize_to_vec().unwrap_err();
assert_eq!(
SerializationError::SerializationFailure(ClarityTypeError::ValueTooLarge.to_string()),
SerializationError::BadTypeError(ClarityTypeError::ValueTooLarge),
err
);
}

#[test]
fn test_serialize_to_hex_returns_serialization_failure() {
fn test_serialize_to_hex_returns_serialization_error_bad_type_error() {
let value = Value::Sequence(SequenceData::String(CharType::ASCII(ASCIIData {
data: vec![0; MAX_VALUE_SIZE as usize + 1],
})));
let err = value.serialize_to_hex().unwrap_err();
assert_eq!(
SerializationError::SerializationFailure(ClarityTypeError::ValueTooLarge.to_string()),
SerializationError::BadTypeError(ClarityTypeError::ValueTooLarge),
err
);
}
Loading
Loading