Add display capabilities to tokenizers objects #1542

ArthurZucker · 2024-06-03T13:22:21Z

>>> from tokenizers import Tokenizer
>>> Tokenizer.from_pretrained("ArthurZ/new-t5-base")
Tokenizer(normalizer=normalizers.Sequence([normalizers.Precompiled(), normalizers.Strip(strip_left=false, strip_right=true), normalizers.Replace(pattern=Regex(" {2,}"), content="▁", regex=SysRegex { regex: Regex { raw: 0x1069ca350 } }]), pre_tokenizer=PreTokenizer(pretok=Metaspace(replacement='▁', prepend_scheme="first", split=true)), model=Unigram(vocab={'<pad>': 0, '</s>': 0, '<unk>': 0, '▁': -2.012292861938477, 'X': -2.486478805541992, ...}, unk_id=2, bos_id=32101, eos_id=32102), post_processor=TemplateProcessing(single=Template([Sequence { id: A, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }]), pair=Template([Sequence { id: A, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }, Sequence { id: B, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }])), decoder=Metaspace(replacement='▁', prepend_scheme="first", split=true), added_vocab=AddedVocabulary(added_tokens_map_r={
        0: AddedToken(content="<pad>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true), 
        1: AddedToken(content="</s>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true), 
        2: AddedToken(content="<unk>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true), ...}, encode_special_tokens=false), truncation=None, padding=None)

HuggingFaceDocBuilderDev · 2024-06-03T13:24:55Z

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

bindings/python/src/pre_tokenizers.rs

updates

…add-display fix git suggestion nit __repr__ should use Debug? small updates

…add-display fix git suggestion nit __repr__ should use Debug? small updates Simple lazygit test

McPatate

I'm a bit confused as to why in some cases you write `impl Display for MyStruct` by hand, in others you `use derive_more::Display;` with `#[derive(Display)] struct MyStruct`, and in others still you use `StructDisplay`.

McPatate · 2024-06-10T14:44:00Z

bindings/python/src/decoders.rs

 #[serde(untagged)]
 pub(crate) enum PyDecoderWrapper {
+    #[display(fmt = "{}", "_0.as_ref().read().unwrap().inner")]


Is this native Rust, or are these capabilities from the `derive_more` crate?

(the display macro)

Also, are you sure `.unwrap()` is the right thing here? Perhaps an `.unwrap_or_else(some_default_display_fn)` would work best?

McPatate · 2024-06-10T14:45:18Z

bindings/python/src/decoders.rs

    Custom(Arc<RwLock<CustomDecoder>>),
+    #[display(fmt = "{}", "_0.as_ref().read().unwrap()")]


McPatate · 2024-06-10T14:45:27Z

bindings/python/src/models.rs

 /// Base class for all models
 ///
 /// The model represents the actual tokenization algorithm. This is the part that
 /// will contain and manage the learned vocabulary.
 ///
 /// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass(module = "tokenizers.models", name = "Model", subclass)]
-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize, Display)]
+#[display(fmt = "{}", "model.as_ref().read().unwrap()")]


McPatate · 2024-06-10T14:46:44Z

bindings/python/src/models.rs

@@ -220,6 +221,12 @@ impl PyModel {
    fn get_trainer(&self, py: Python<'_>) -> PyResult<PyObject> {
        PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py)
    }
+    fn __str__(&self) -> PyResult<String> {
+        Ok(format!("{}", self.model.read().unwrap()))


If read() returns a Result, then you can probably convert it to a PyResult here rather than unwrapping it.
If it returns an Option, then perhaps returning a default value rather than unwrapping would be preferable.

McPatate · 2024-06-10T14:46:52Z

bindings/python/src/models.rs

+        Ok(format!("{}", self.model.read().unwrap()))
+    }
+    fn __repr__(&self) -> PyResult<String> {
+        Ok(format!("{}", self.model.read().unwrap()))


McPatate · 2024-06-10T14:49:29Z

bindings/python/src/tokenizer.rs

-    #[pyo3(signature = ())]
-    #[pyo3(text_signature = "(self)")]
+    #[getter]


Is this code equivalent?

McPatate · 2024-06-10T14:51:59Z

tokenizers/display_derive/src/lib.rs

+                _ => unimplemented!(),
+            }
+        },
+        _ => unimplemented!(),


Not sure how to handle errors in macros, but I'd take a look rather than leaving a call to `unimplemented!()`.

McPatate · 2024-06-10T14:55:14Z

tokenizers/src/pre_tokenizers/byte_level.rs

-use crate::utils::SysRegex;
-use serde::{Deserialize, Serialize};
-
 use crate::tokenizer::{
    Decoder, Encoding, PostProcessor, PreTokenizedString, PreTokenizer, Result,
    SplitDelimiterBehavior,
 };
 use crate::utils::macro_rules_attribute;
+use crate::utils::SysRegex;
+use display_derive::StructDisplay;
+use serde::{Deserialize, Serialize};



Are you using `rustfmt`?

McPatate · 2024-06-10T14:57:52Z

tokenizers/display_derive/src/lib.rs

+                Fields::Named(fields) => {
+                    // If the struct has named fields
+                    let field_names = fields.named.iter().map(|f| &f.ident);
+                    let field_names2 = field_names.clone();


Suggested change

let field_names2 = field_names.clone();

let field_names_clone = field_names.clone();

Also, why do you need to clone?

McPatate · 2024-06-10T15:00:45Z

tokenizers/display_derive/src/lib.rs

+                                        let mut prefix = (&mut chars).take(100 - 1).collect::<String>();
+                                        if chars.next().is_some() {
+                                            prefix.push('…');
+                                        }


Wasn't that what the ellipse crate was for?

yeah but it was too annoying to use 😢

Then remove it from your Cargo.toml file 😉

I think what you wrote is perfectly fine and does not require bringing in the extra crate!

Oh, I thought I had removed it, lol. On it!

bindings/python/Cargo.toml

…add-display

I wanted to remove the derive more crate and implement stuff

no it's not optimal but I need to go

initial commit

61804d9

ArthurZucker added 15 commits June 3, 2024 15:25

will this work?

a56da5f

make it work for the model for now

f1a6a97

updates

4a49530

update

f4af616

add metaspace

88630dc

update

b9d44da

does not work

a90ec22

current modifications

2224275

current status

4d9204e

working shit

4c2aca1

this kinda works

904ce70

finallllly!

6413810

nits

fda66f5

updates

20c9fc4

almost there

86c77b6

ArthurZucker mentioned this pull request Jun 5, 2024

Adding pretty print of tokenizer #1540

Closed

ArthurZucker commented Jun 5, 2024

View reviewed changes

bindings/python/src/pre_tokenizers.rs Outdated Show resolved Hide resolved

ArthurZucker mentioned this pull request Jun 5, 2024

How can I get the mapping relationship between byte values and Unicode characters of the fast tokenizer? #1545

Open

ArthurZucker and others added 6 commits June 6, 2024 16:34

update

a429642

updates

more nits

3cec010

nit

8d77286

Update bindings/python/src/pre_tokenizers.rs

e48cd3a

ips

27576e5

Merge branch 'add-display' of github.com:huggingface/tokenizers into …

0d9a452

…add-display fix git suggestion nit __repr__ should use Debug? small updates

ArthurZucker force-pushed the add-display branch from f43cefc to 0d9a452 Compare June 7, 2024 09:03

Merge branch 'add-display' of github.com:huggingface/tokenizers into …

35373de

…add-display fix git suggestion nit __repr__ should use Debug? small updates Simple lazygit test

ArthurZucker force-pushed the add-display branch from 0d9a452 to 35373de Compare June 7, 2024 14:36

update

1c6d272

ArthurZucker added 3 commits June 10, 2024 14:22

update

4a34870

update

e0d35e0

some finishing touch

fe95add

haixuanTao mentioned this pull request Jun 10, 2024

Python API automatic __str__ derivation dora-rs/dora#544

Open

McPatate reviewed Jun 10, 2024

View reviewed changes

ArthurZucker commented Jun 10, 2024

View reviewed changes

bindings/python/Cargo.toml Outdated Show resolved Hide resolved

ArthurZucker and others added 24 commits June 10, 2024 18:26

Update bindings/python/Cargo.toml

2770099

nit

3d0eb0a

gracefully handle errors for the proc macro

11a3601

Merge branch 'add-display' of github.com:huggingface/tokenizers into …

f6fa136

…add-display

remove derive_more

2a54482

I wanted to remove the derive more crate and implement stuff

update my custom macro

998b2a3

replace derive more

4df6cc2

Merge branch 'main' into add-display

a9c6c61

stash

aefdc91

updates

f67af9c

update display derive

4c3f37a

blindly fix stuff

292475f

maybe work

99cb054

remove tests from vendored parsing

5c930e9

update

f87bb97

simplify some stuff

c4b4f3c

current status, not bad but not soooooo good

e712079

is this a good start?

5540136

small changes

ba03c16

format does not work yet

d0e741b

some cleanup of unnecessary things

19afb66

nit

9559dea

current status

e53f4ca

let's just go with this

18238dd

no it's not optimal but I need to go

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add display capabilities to tokenizers objects #1542

Add display capabilities to tokenizers objects #1542

ArthurZucker commented Jun 3, 2024 •

edited

Loading

HuggingFaceDocBuilderDev commented Jun 3, 2024

McPatate left a comment

McPatate Jun 10, 2024 •

edited

Loading

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

McPatate Jun 10, 2024

ArthurZucker Jun 10, 2024

McPatate Jun 10, 2024

ArthurZucker Jun 10, 2024

		Custom(Arc<RwLock<CustomDecoder>>),
		#[display(fmt = "{}", "_0.as_ref().read().unwrap()")]

	let field_names2 = field_names.clone();
	let field_names_clone = field_names.clone();

Add display capabilities to tokenizers objects #1542

Are you sure you want to change the base?

Add display capabilities to tokenizers objects #1542

Conversation

ArthurZucker commented Jun 3, 2024 • edited Loading

HuggingFaceDocBuilderDev commented Jun 3, 2024

McPatate left a comment

Choose a reason for hiding this comment

McPatate Jun 10, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ArthurZucker commented Jun 3, 2024 •

edited

Loading

McPatate Jun 10, 2024 •

edited

Loading