gnu: Add rust-tokenizers.

* gnu/packages/machine-learning.scm (rust-tokenizers): New variable.

Change-Id: I3189a2d826f072f65ad053d77eb39be39775f1c2
This commit is contained in:
Nicolas Graves 2024-09-07 18:56:14 +02:00 committed by Ricardo Wurmus
parent bd7e89a1fd
commit dadb51bb82
No known key found for this signature in database
GPG key ID: 197A5888235FACAC

View file

@@ -5957,6 +5957,66 @@ specialized and not intended for general use.")
python package, but only implements a smaller subset of functions.")
(license license:asl2.0)))
(define-public rust-tokenizers
  (package
    (name "rust-tokenizers")
    (version "0.19.1")
    (source
     (origin
       (method url-fetch)
       (uri (crate-uri "tokenizers" version))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1zg6ffpllygijb5bh227m9p4lrhf0pjkysky68kddwrsvp8zl075"))
       (modules '((guix build utils)))
       ;; Relax version requirements in Cargo.toml so the crate builds
       ;; against the dependency versions packaged in Guix.  The dots and
       ;; quotes are escaped so the patterns match literally rather than as
       ;; regexp wildcards.
       (snippet
        #~(substitute* "Cargo.toml"
            ;; rust-monostate 0.1.12 requires a rust-syn-2 update.
            (("0\\.1\\.12")
             "0.1.11")
            ;; Downgrade the rust-onig requirement to the packaged version.
            (("version = \"6\\.4\"")
             "version = \"6.1.1\"")))))
    (build-system cargo-build-system)
    (arguments
     (list
      #:tests? #f ; tests rely on data missing from the crate tarball
      #:cargo-inputs
      `(("rust-aho-corasick" ,rust-aho-corasick-1)
        ("rust-derive-builder" ,rust-derive-builder-0.20)
        ("rust-esaxx-rs" ,rust-esaxx-rs-0.1)
        ("rust-fancy-regex" ,rust-fancy-regex-0.13)
        ("rust-getrandom" ,rust-getrandom-0.2)
        ("rust-hf-hub" ,rust-hf-hub-0.3)
        ("rust-indicatif" ,rust-indicatif-0.17)
        ("rust-itertools" ,rust-itertools-0.12)
        ("rust-lazy-static" ,rust-lazy-static-1)
        ("rust-log" ,rust-log-0.4)
        ("rust-macro-rules-attribute" ,rust-macro-rules-attribute-0.2)
        ("rust-monostate" ,rust-monostate-0.1)
        ("rust-onig" ,rust-onig-6)
        ("rust-paste" ,rust-paste-1)
        ("rust-rand" ,rust-rand-0.8)
        ("rust-rayon" ,rust-rayon-1)
        ("rust-rayon-cond" ,rust-rayon-cond-0.3)
        ("rust-regex" ,rust-regex-1)
        ("rust-regex-syntax" ,rust-regex-syntax-0.8)
        ("rust-serde" ,rust-serde-1)
        ("rust-serde-json" ,rust-serde-json-1)
        ("rust-spm-precompiled" ,rust-spm-precompiled-0.1)
        ("rust-thiserror" ,rust-thiserror-1)
        ;; Kept in alphabetical order per Guix convention.
        ("rust-unicode-categories" ,rust-unicode-categories-0.1)
        ("rust-unicode-normalization-alignments" ,rust-unicode-normalization-alignments-0.1)
        ("rust-unicode-segmentation" ,rust-unicode-segmentation-1))
      #:cargo-development-inputs
      `(("rust-assert-approx-eq" ,rust-assert-approx-eq-1)
        ("rust-criterion" ,rust-criterion-0.5)
        ("rust-tempfile" ,rust-tempfile-3))))
    (home-page "https://github.com/huggingface/tokenizers")
    (synopsis "Implementation of various popular tokenizers")
    (description
     "This package provides a Rust implementation of today's most used
tokenizers, with a focus on performances and versatility.")
    (license license:asl2.0)))
(define-public python-hmmlearn
(package
(name "python-hmmlearn")