mirror of
https://codeberg.org/guix/guix.git
synced 2025-10-02 02:15:12 +00:00
gnu: Add python-tokenizers.
* gnu/packages/machine-learning.scm (python-tokenizers): New variable. Change-Id: I5db95172255dc4635c2a417f3b7252454eea27d7
This commit is contained in:
parent
5aea24cf89
commit
6483fdee51
1 changed file with 111 additions and 0 deletions
|
@ -6117,6 +6117,117 @@ python package, but only implements a smaller subset of functions.")
|
||||||
tokenizers, with a focus on performances and versatility.")
|
tokenizers, with a focus on performances and versatility.")
|
||||||
(license license:asl2.0)))
|
(license license:asl2.0)))
|
||||||
|
|
||||||
|
;; Python bindings to the Hugging Face "tokenizers" Rust library.  The PyPI
;; sdist ships the Rust workspace plus the Python bindings; we keep only the
;; bindings tree and build it with the Cargo build system, installing the
;; resulting shared object into a Python site-packages layout by hand.
(define-public python-tokenizers
  (package
    (name "python-tokenizers")
    (version "0.19.1")
    (source
     (origin
       (method url-fetch)
       (uri (pypi-uri "tokenizers" version))
       (sha256
        (base32 "1qw8mjp0q9w7j1raq1rvcbfw38000kbqpwscf9mvxzfh1rlfcngf"))
       (modules '((guix build utils)
                  (ice-9 ftw)))
       (snippet
        ;; Only keeping bindings: drop the vendored Rust workspace and hoist
        ;; bindings/python/ up to the top level so cargo-build-system sees it.
        #~(begin
            (for-each (lambda (file)
                        (unless (member file '("." ".." "bindings" "PKG-INFO"))
                          (delete-file-recursively file)))
                      (scandir "."))
            (for-each (lambda (file)
                        (unless (member file '("." ".."))
                          (rename-file (string-append "bindings/python/" file)
                                       file)))
                      (scandir "bindings/python"))
            ;; Remove the bundled cargo configuration (vendoring directives).
            (delete-file-recursively ".cargo")))))
    (build-system cargo-build-system)
    (arguments
     (list
      #:cargo-test-flags ''("--no-default-features")
      ;; Pull in the pyproject build-system modules so the phases below can
      ;; reuse its helpers alongside the cargo ones.
      #:imported-modules `(,@%cargo-build-system-modules
                           ,@%pyproject-build-system-modules)
      #:modules '((guix build cargo-build-system)
                  ((guix build pyproject-build-system) #:prefix py:)
                  (guix build utils)
                  (ice-9 regex)
                  (ice-9 textual-ports))
      #:phases
      #~(modify-phases %standard-phases
          (add-after 'unpack-rust-crates 'inject-tokenizers
            ;; Rewrite Cargo.toml to depend on Guix's rust-tokenizers (and
            ;; matching dev-dependencies) instead of the path dependency on
            ;; the workspace we deleted in the snippet, then truncate the file
            ;; at the stale [dependencies.tokenizers] section.
            (lambda _
              (substitute* "Cargo.toml"
                (("\\[dependencies\\]")
                 (format #f "
[dev-dependencies]
tempfile = ~s
pyo3 = { version = ~s, features = [\"auto-initialize\"] }

[dependencies]
tokenizers = ~s"
                         #$(package-version rust-tempfile-3)
                         #$(package-version rust-pyo3-0.21)
                         #$(package-version rust-tokenizers))))
              (let ((file-path "Cargo.toml"))
                (call-with-input-file file-path
                  (lambda (port)
                    (let* ((content (get-string-all port))
                           (top-match (string-match
                                       "\\[dependencies.tokenizers" content)))
                      (call-with-output-file file-path
                        (lambda (out)
                          (format out "~a" (match:prefix top-match))))))))))
          (add-after 'patch-cargo-checksums 'loosen-requirements
            ;; Relax the onig version pin to whatever rust-onig Guix provides.
            (lambda _
              (substitute* "Cargo.toml"
                (("version = \"6.4\"")
                 (format #f "version = ~s"
                         #$(package-version rust-onig-6))))))
          (add-after 'check 'python-check
            ;; Run the Python test suite against the freshly built extension
            ;; by placing the shared object where the pure-Python package
            ;; expects it and extending sys.path to the source tree.
            (lambda _
              (copy-file "target/release/libtokenizers.so"
                         "py_src/tokenizers/tokenizers.so")
              (invoke "python3"
                      "-c" (format #f
                                   "import sys; sys.path.append(\"~a/py_src\")"
                                   (getcwd))
                      "-m" "pytest"
                      "-s" "-v" "./tests/")))
          (add-after 'install 'install-python
            ;; cargo-build-system installs nothing useful for Python; copy
            ;; the package (with the extension) into site-packages and write
            ;; a minimal .dist-info so pip/importlib.metadata can find it.
            (lambda _
              (let* ((pversion #$(version-major+minor (package-version python)))
                     (lib (string-append #$output "/lib/python" pversion
                                         "/site-packages/"))
                     (info (string-append lib "tokenizers-"
                                          #$(package-version this-package)
                                          ".dist-info")))
                (mkdir-p info)
                (copy-file "PKG-INFO" (string-append info "/METADATA"))
                (copy-recursively
                 "py_src/tokenizers"
                 (string-append lib "tokenizers"))))))
      #:cargo-inputs
      `(("rust-rayon" ,rust-rayon-1)
        ("rust-serde" ,rust-serde-1)
        ("rust-serde-json" ,rust-serde-json-1)
        ("rust-libc" ,rust-libc-0.2)
        ("rust-env-logger" ,rust-env-logger-0.11)
        ("rust-pyo3" ,rust-pyo3-0.21)
        ("rust-numpy" ,rust-numpy-0.21)
        ("rust-ndarray" ,rust-ndarray-0.15)
        ("rust-onig" ,rust-onig-6)
        ("rust-itertools" ,rust-itertools-0.12)
        ("rust-tokenizers" ,rust-tokenizers))
      #:cargo-development-inputs
      `(("rust-tempfile" ,rust-tempfile-3))))
    (native-inputs
     (list python-minimal python-pytest))
    (home-page "https://huggingface.co/docs/tokenizers")
    (synopsis "Implementation of various popular tokenizers")
    (description
     "This package provides bindings to a Rust implementation of the most used
tokenizers, @code{rust-tokenizers}.")
    (license license:asl2.0)))
|
||||||
|
|
||||||
(define-public python-hmmlearn
|
(define-public python-hmmlearn
|
||||||
(package
|
(package
|
||||||
(name "python-hmmlearn")
|
(name "python-hmmlearn")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue