stacks-network · benjamin-stacks · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -35,6 +35,7 @@ rusqlite = { version = "0.31.0", features = ["blob", "serde_json", "i128_blob",
 thiserror = "1.0.65"
 tikv-jemallocator = "0.5.4"
 toml = "0.5.6"
+criterion = "0.8.2"
 
 # Use a bit more than default optimization for
 # dev builds to speed up test execution

diff --git a/changelog.d/6948-utf8char-fixed-array-repr.changed b/changelog.d/6948-utf8char-fixed-array-repr.changed
@@ -0,0 +1,2 @@
+Replaced `UTF8Data` internal representation from `Vec<Vec<u8>>` to `Vec<Utf8Char>` (`[u8; 4]`), reducing heap allocations and speeding up clones.
+Made `Utf8Char` inner field private with validation on deserialization.
diff --git a/clarity-types/Cargo.toml b/clarity-types/Cargo.toml
@@ -20,8 +20,14 @@ slog = { workspace = true }
 stacks_common = { package = "stacks-common", path = "../stacks-common", default-features = false }
 
 [dev-dependencies]
+criterion = { workspace = true}
 mutants = "0.0.3"
 rstest = "0.17.0"
+serde_json = "1.0"
+
+[[bench]]
+name = "utf8_data"
+harness = false
 
 [features]
 default = []

diff --git a/clarity-types/benches/utf8_data.rs b/clarity-types/benches/utf8_data.rs
@@ -0,0 +1,144 @@
+// Copyright (C) 2026 Stacks Open Internet Foundation
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+use std::hint::black_box;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+
+// --- Helpers to build comparable data structures ---
+
+/// New representation, modeled here as `Vec<char>`: one fixed-size element per character.
+fn make_new_ascii(n: usize) -> Vec<char> {
+ (0..n).map(|_| 'A').collect()
+}
+
+fn make_new_multibyte(n: usize) -> Vec<char> {
+ // U+2603 snowman = 0xE2 0x98 0x83 (3 bytes)
+ (0..n).map(|_| '\u{2603}').collect()
+}
+
+/// Old representation: Vec<Vec<u8>>
+fn make_old_ascii(n: usize) -> Vec<Vec<u8>> {
+ (0..n).map(|_| vec![b'A']).collect()
+}
+
+fn make_old_multibyte(n: usize) -> Vec<Vec<u8>> {
+ (0..n).map(|_| vec![0xE2, 0x98, 0x83]).collect()
+}
+
+fn bench_utf8(c: &mut Criterion) {
+ let mut group = c.benchmark_group("utf8_data");
+
+ for size in [100, 1000] {
+ let new_ascii = make_new_ascii(size);
+ let old_ascii = make_old_ascii(size);
+ let new_multi = make_new_multibyte(size);
+ let old_multi = make_old_multibyte(size);
+
+ group.bench_function(format!("new_clone_ascii_{size}"), |b| {
+ b.iter(|| black_box(new_ascii.clone()));
+ });
+ group.bench_function(format!("old_clone_ascii_{size}"), |b| {
+ b.iter(|| black_box(old_ascii.clone()));
+ });
+
+ group.bench_function(format!("new_clone_multibyte_{size}"), |b| {
+ b.iter(|| black_box(new_multi.clone()));
+ });
+ group.bench_function(format!("old_clone_multibyte_{size}"), |b| {
+ b.iter(|| black_box(old_multi.clone()));
+ });
+
+ // --- Construction: raw data structure comparison ---
+ group.bench_function(format!("new_construct_ascii_{size}"), |b| {
+ b.iter(|| {
+ black_box((0..size).map(|_| 'A').collect::<Vec<_>>());
+ });
+ });
+ group.bench_function(format!("old_construct_ascii_{size}"), |b| {
+ b.iter(|| {
+ black_box((0..size).map(|_| vec![b'A']).collect::<Vec<Vec<u8>>>());
+ });
+ });
+
+ group.bench_function(format!("new_construct_multibyte_{size}"), |b| {
+ b.iter(|| {
+ black_box((0..size).map(|_| '\u{2603}').collect::<Vec<_>>());
+ });
+ });
+ group.bench_function(format!("old_construct_multibyte_{size}"), |b| {
+ b.iter(|| {
+ black_box(
+ (0..size)
+ .map(|_| vec![0xE2u8, 0x98, 0x83])
+ .collect::<Vec<Vec<u8>>>(),
+ );
+ });
+ });
+
+ // --- Full bytes→data pipeline (end-to-end) ---
+ // Both paths: validate UTF-8 → decode chars → collect.
+ // New: Vec<char> (fixed-size elements stored inline in the Vec's single buffer; no per-char allocation).
+ // Old: Vec<Vec<u8>> (one separate heap allocation per char).
+ let ascii_bytes: Vec<u8> = "A".repeat(size).into_bytes();
+ let multi_bytes: Vec<u8> = "\u{2603}".repeat(size).into_bytes();
+
+ group.bench_function(format!("new_value_construct_ascii_{size}"), |b| {
+ b.iter(|| {
+ let s = std::str::from_utf8(&ascii_bytes).unwrap();
+ black_box(s.chars().collect::<Vec<_>>());
+ });
+ });
+ group.bench_function(format!("old_value_construct_ascii_{size}"), |b| {
+ b.iter(|| {
+ let s = std::str::from_utf8(&ascii_bytes).unwrap();
+ black_box(
+ s.chars()
+ .map(|c| {
+ let mut buf = vec![0u8; c.len_utf8()];
+ c.encode_utf8(&mut buf);
+ buf
+ })
+ .collect::<Vec<Vec<u8>>>(),
+ );
+ });
+ });
+
+ group.bench_function(format!("new_value_construct_multibyte_{size}"), |b| {
+ b.iter(|| {
+ let s = std::str::from_utf8(&multi_bytes).unwrap();
+ black_box(s.chars().collect::<Vec<_>>());
+ });
+ });
+ group.bench_function(format!("old_value_construct_multibyte_{size}"), |b| {
+ b.iter(|| {
+ let s = std::str::from_utf8(&multi_bytes).unwrap();
+ black_box(
+ s.chars()
+ .map(|c| {
+ let mut buf = vec![0u8; c.len_utf8()];
+ c.encode_utf8(&mut buf);
+ buf
+ })
+ .collect::<Vec<Vec<u8>>>(),
+ );
+ });
+ });
+ }
+
+ group.finish();
+}
+
+criterion_group!(benches, bench_utf8);
+criterion_main!(benches);
diff --git a/clarity-types/src/tests/types/mod.rs b/clarity-types/src/tests/types/mod.rs
@@ -354,12 +354,6 @@ fn test_ascii_data_to_value_returns_clarity_type_error() {
  assert_eq!(ClarityTypeError::InvalidAsciiCharacter(1), err);
 }
 
-#[test]
-fn test_utf8_data_to_value_returns_clarity_types_error_invalid_utf8_encoding() {
- let err = UTF8Data::to_value(&vec![0xED, 0xA0, 0x80]).unwrap_err();
- assert_eq!(ClarityTypeError::InvalidUtf8Encoding, err);
-}
-
 #[test]
 fn test_tuple_data_from_data_typed_returns_clarity_type_error() {
  let tuple_type =
@@ -587,7 +581,7 @@ fn test_ascii_data_len_returns_clarity_type_error() {
 #[test]
 fn test_utf8_data_len_returns_clarity_type_error() {
  let err = UTF8Data {
- data: vec![vec![]; MAX_VALUE_SIZE as usize + 1],
+ data: vec!['\0'; MAX_VALUE_SIZE as usize + 1],
  }
  .len()
  .unwrap_err();
@@ -627,3 +621,107 @@ fn invalid_utf8_string_from_bytes() {
 
  assert!(matches!(err, ClarityTypeError::InvalidUtf8Encoding));
 }
+
+#[test]
+fn utf8data_serde_roundtrip_ascii() {
+ let data = UTF8Data {
+ data: vec!['H', 'i'],
+ };
+ let json = serde_json::to_string(&data).unwrap();
+ let deserialized: UTF8Data = serde_json::from_str(&json).unwrap();
+ assert_eq!(data, deserialized);
+}
+
+#[test]
+fn utf8data_serde_roundtrip_multibyte() {
+ // Snowman U+2603
+ let data = UTF8Data {
+ data: vec!['\u{2603}'],
+ };
+ let json = serde_json::to_string(&data).unwrap();
+ let deserialized: UTF8Data = serde_json::from_str(&json).unwrap();
+ assert_eq!(data, deserialized);
+}
+
+#[test]
+fn utf8data_serde_roundtrip_four_byte() {
+ // Grinning face U+1F600
+ let data = UTF8Data {
+ data: vec!['\u{1F600}'],
+ };
+ let json = serde_json::to_string(&data).unwrap();
+ let deserialized: UTF8Data = serde_json::from_str(&json).unwrap();
+ assert_eq!(data, deserialized);
+}
+
+#[test]
+fn utf8data_serde_roundtrip_mixed() {
+ // "A" + snowman + grinning face
+ let data = UTF8Data {
+ data: vec!['A', '\u{2603}', '\u{1F600}'],
+ };
+ let json = serde_json::to_string(&data).unwrap();
+ let deserialized: UTF8Data = serde_json::from_str(&json).unwrap();
+ assert_eq!(data, deserialized);
+}
+
+#[test]
+fn utf8data_serializes_only_significant_bytes() {
+ // ASCII 'A' should serialize as [65], not [65, 0, 0, 0]
+ let data = UTF8Data { data: vec!['A'] };
+ let json = serde_json::to_string(&data).unwrap();
+ assert_eq!(json, "[[65]]");
+}
+
+#[test]
+fn utf8data_serializes_multibyte_significant_bytes() {
+ // Snowman should serialize as [226, 152, 131], not [226, 152, 131, 0]
+ let data = UTF8Data {
+ data: vec!['\u{2603}'],
+ };
+ let json = serde_json::to_string(&data).unwrap();
+ assert_eq!(json, "[[226,152,131]]");
+}
+
+#[test]
+fn utf8data_deserializes_from_old_format() {
+ // Old format: Vec<Vec<u8>> — ensure backward compatibility
+ let old_json = "[[65],[226,152,131],[240,159,152,128]]";
+ let deserialized: UTF8Data = serde_json::from_str(old_json).unwrap();
+ assert_eq!(
+ deserialized,
+ UTF8Data {
+ data: vec!['A', '\u{2603}', '\u{1F600}',],
+ }
+ );
+}
+
+#[test]
+fn utf8data_serde_roundtrip_empty() {
+ let data = UTF8Data { data: vec![] };
+ let json = serde_json::to_string(&data).unwrap();
+ assert_eq!(json, "[]");
+ let deserialized: UTF8Data = serde_json::from_str(&json).unwrap();
+ assert_eq!(data, deserialized);
+}
+
+#[test]
+fn utf8data_deserialize_rejects_invalid_utf8() {
+ // 0xFF is not a valid UTF-8 leading byte
+ let json = "[[255]]";
+ serde_json::from_str::<UTF8Data>(json).unwrap_err();
+}
+
+#[test]
+fn utf8data_deserialize_rejects_empty_entry() {
+ // An empty byte array doesn't represent a character
+ let json = "[[]]";
+ serde_json::from_str::<UTF8Data>(json).unwrap_err();
+}
+
+#[test]
+fn utf8data_deserialize_rejects_multi_codepoint_entry() {
+ // Two ASCII chars in a single entry
+ let json = "[[65, 66]]";
+ serde_json::from_str::<UTF8Data>(json).unwrap_err();
+}
diff --git a/clarity-types/src/tests/types/serialization.rs b/clarity-types/src/tests/types/serialization.rs
@@ -416,25 +416,25 @@ fn test_principals() {
 }
 
 #[test]
-fn test_serialize_to_vec_returns_serialization_failure() {
+fn test_serialize_to_vec_returns_serialization_error_bad_type_error() {
  let value = Value::Sequence(SequenceData::String(CharType::ASCII(ASCIIData {
  data: vec![0; MAX_VALUE_SIZE as usize + 1],
  })));
  let err = value.serialize_to_vec().unwrap_err();
  assert_eq!(
- SerializationError::SerializationFailure(ClarityTypeError::ValueTooLarge.to_string()),
+ SerializationError::BadTypeError(ClarityTypeError::ValueTooLarge),
  err
  );
 }
 
 #[test]
-fn test_serialize_to_hex_returns_serialization_failure() {
+fn test_serialize_to_hex_returns_serialization_error_bad_type_error() {
  let value = Value::Sequence(SequenceData::String(CharType::ASCII(ASCIIData {
  data: vec![0; MAX_VALUE_SIZE as usize + 1],
  })));
  let err = value.serialize_to_hex().unwrap_err();
  assert_eq!(
- SerializationError::SerializationFailure(ClarityTypeError::ValueTooLarge.to_string()),
+ SerializationError::BadTypeError(ClarityTypeError::ValueTooLarge),
  err
  );
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Replaced `UTF8Data` internal representation from `Vec<Vec<u8>>` to `Vec<Utf8Char>` (`[u8; 4]`), reducing heap allocations and speeding up clones.
		Made `Utf8Char` inner field private with validation on deserialization.