diff --git a/Cargo.lock b/Cargo.lock index 6bc88a1..0701cd0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,12 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.20" @@ -96,29 +102,12 @@ dependencies = [ "wait-timeout", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.4" @@ -206,19 +195,19 @@ dependencies = [ [[package]] name = "charset-normalizer-rs" -version = "1.1.0" +version = "1.2.0" dependencies = [ "ahash", "assert_cmd", - "bitflags 2.9.4", + "bitflags", "cached", "chardet", "chardetng", - "clap 4.5.47", + "clap", "counter", "criterion", "dialoguer", - "encoding", + "encoding_rs", "env_logger", "icu_normalizer", "icu_properties", @@ -234,14 +223,30 @@ dependencies = [ ] [[package]] -name = "clap" -version = "2.34.0" +name = "ciborium" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ - "bitflags 1.3.2", - "textwrap", - "unicode-width 0.1.14", + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", ] [[package]] @@ -299,7 +304,7 @@ dependencies = [ "encode_unicode", "libc", "once_cell", - "unicode-width 0.2.1", + "unicode-width", "windows-sys 0.59.0", ] @@ -314,25 +319,22 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.6" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" dependencies = [ - "atty", + "anes", "cast", - "clap 2.34.0", + "ciborium", + "clap", "criterion-plot", - "csv", "itertools", - "lazy_static", "num-traits", "oorandom", "plotters", "rayon", "regex", "serde", - "serde_cbor", - "serde_derive", "serde_json", "tinytemplate", "walkdir", @@ -340,9 +342,9 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.4.5" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" dependencies = [ "cast", "itertools", @@ -374,25 +376,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] -name = "csv" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.12" +name = "crunchy" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" -dependencies = [ - "memchr", -] +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "darling" @@ -476,70 +463,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -640,9 +563,13 @@ dependencies = [ [[package]] name = "half" -version = "1.8.3" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] [[package]] name = "hashbrown" @@ -661,15 +588,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "icu_collections" version = "1.5.0" @@ -802,9 +720,9 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" -version = "0.10.5" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -849,12 +767,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "libc" version = "0.2.175" @@ -1156,7 +1068,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -1194,16 +1106,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - [[package]] name = "serde_core" version = "1.0.225" @@ -1308,15 +1210,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" version = "2.0.16" @@ -1363,12 +1256,6 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" -[[package]] -name = "unicode-width" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" - [[package]] name = "unicode-width" version = "0.2.1" @@ -1541,22 +1428,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" version = "0.1.11" @@ -1566,12 +1437,6 @@ dependencies = [ "windows-sys 0.61.0", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-link" version = "0.1.3" diff --git a/Cargo.toml b/Cargo.toml index a99f506..afa5e89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "charset-normalizer-rs" -version = "1.1.0" +version = "1.2.0" authors = ["Nikolay Yarovoy "] edition = "2021" description = "Truly universal encoding detector in pure Rust - port of Python version" @@ -27,7 +27,7 @@ chardetng = { version = "0.1.17", optional = true } clap = { version = "4.4.2", features = ["derive"], optional = true} counter = "0.7.0" dialoguer = { version = "0.10.4", optional = true } -encoding = "0.2.33" +encoding_rs = "0.8.5" env_logger = { version = "0.11.0", optional = true } icu_normalizer = "1.3.2" icu_properties = "1.3.2" @@ -42,16 +42,18 @@ unicode_names2 = "2.0.0" [dev-dependencies] assert_cmd = "2.0.12" -criterion = "0.3" -predicates = "3.0.3" +criterion = "0.7" +predicates = "3.1.3" [[bench]] name = "large_payload" harness = false +required-features = ["performance"] [[bench]] name = "large_datasets" harness = false +required-features = ["performance"] [features] cli = ["clap", "dialoguer", "env_logger"] diff --git a/src/assets.rs b/src/assets.rs index 0f20934..7cf5fa0 100644 --- a/src/assets.rs +++ b/src/assets.rs @@ -1,66 +1,85 @@ use crate::entity::Language; -use ahash::HashMap; +use ahash::HashSet; use once_cell::sync::Lazy; -use std::iter::FromIterator; -pub(crate) static LANGUAGES: Lazy<[(Language, &'static str, bool, bool); 41]> = Lazy::new(|| { - [ - // language, alphabet, have_accents, pure_latin - (Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ), - (Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ), - (Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ), - (Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ), - (Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ), - (Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ), - (Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ), - (Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ), - (Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ), - (Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ), - (Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ), - (Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ), - (Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ), - (Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ), - (Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ), - (Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ), - (Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ), - (Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ), - (Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ), - (Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ), - (Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ), - (Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ), - (Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ), - (Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ), - (Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ), - (Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ), - (Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ), - (Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ), - (Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ), - (Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ), - (Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ), - (Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ), - (Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ), - (Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ), - (Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ), - (Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ), - (Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ), - (Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ), - (Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ), - (Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ), - (Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ), -] -}); -pub(crate) static LANGUAGE_SUPPORTED_COUNT: Lazy = Lazy::new(|| LANGUAGES.len()); // 41 +pub(crate) struct LanguageEntry { + pub language: Language, + pub alphabet: &'static str, + pub alphabet_set: HashSet, + pub have_accents: bool, + pub pure_latin: bool, +} + +impl LanguageEntry { + pub fn new( + language: Language, + alphabet: &'static str, + have_accents: bool, + pure_latin: bool, + ) -> Self { + Self { + language, + alphabet, + alphabet_set: alphabet.chars().collect(), + have_accents, + pure_latin, + } + } + + pub fn get(language: &Language) -> Result<&Self, String> { + for entry in LANGUAGES.iter() { + if entry.language == *language { + return Ok(entry); + } + } + Err(String::from("Language wasn't found")) + } +} -pub(crate) static ENCODING_TO_LANGUAGE: Lazy> = Lazy::new(|| { - HashMap::from_iter([ - ("euc-kr", Language::Korean), - ("big5", Language::Chinese), - ("hz", Language::Chinese), - ("gbk", Language::Chinese), - ("gb18030", Language::Chinese), - ("euc-jp", Language::Japanese), - ("iso-2022-jp", Language::Japanese), - ("shift_jis", Language::Japanese), - ]) +pub(crate) static LANGUAGES: Lazy> = Lazy::new(|| { + vec![ + // language, alphabet, have_accents, pure_latin + LanguageEntry::new(Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ), + LanguageEntry::new(Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ), + LanguageEntry::new(Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ), + LanguageEntry::new(Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ), + LanguageEntry::new(Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ), + LanguageEntry::new(Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ), + LanguageEntry::new(Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ), + LanguageEntry::new(Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ), + LanguageEntry::new(Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ), + LanguageEntry::new(Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ), + LanguageEntry::new(Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ), + LanguageEntry::new(Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ), + LanguageEntry::new(Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ), + LanguageEntry::new(Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ), + LanguageEntry::new(Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ), + LanguageEntry::new(Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ), + LanguageEntry::new(Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ), + LanguageEntry::new(Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ), + LanguageEntry::new(Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ), + LanguageEntry::new(Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ), + LanguageEntry::new(Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ), + LanguageEntry::new(Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ), + LanguageEntry::new(Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ), + LanguageEntry::new(Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ), + LanguageEntry::new(Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ), + LanguageEntry::new(Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ), + LanguageEntry::new(Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ), + LanguageEntry::new(Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ), + LanguageEntry::new(Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ), + LanguageEntry::new(Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ), + LanguageEntry::new(Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ), + LanguageEntry::new(Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ), + LanguageEntry::new(Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ), + LanguageEntry::new(Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ), + LanguageEntry::new(Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ), + LanguageEntry::new(Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ), + LanguageEntry::new(Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ), + LanguageEntry::new(Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ), + LanguageEntry::new(Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ), + LanguageEntry::new(Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ), + LanguageEntry::new(Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ), + ] }); diff --git a/src/cd.rs b/src/cd.rs index d60ae27..c787e8f 100644 --- a/src/cd.rs +++ b/src/cd.rs @@ -1,16 +1,14 @@ #![allow(unused_variables)] -use crate::assets::{ENCODING_TO_LANGUAGE, LANGUAGES, LANGUAGE_SUPPORTED_COUNT}; +use crate::assets::{LanguageEntry, LANGUAGES}; use crate::consts::TOO_SMALL_SEQUENCE; +use crate::enc::{Encoding, IsChunk, WantDecode}; use crate::entity::{CoherenceMatch, CoherenceMatches, Language}; use crate::utils::{ - get_language_data, is_accentuated, is_multi_byte_encoding, is_suspiciously_successive_range, - is_unicode_range_secondary, unicode_range, + is_accentuated, is_suspiciously_successive_range, is_unicode_range_secondary, unicode_range, }; use ahash::{HashMap, HashMapExt, HashSet}; use cached::proc_macro::cached; use counter::Counter; -use encoding::label::encoding_from_whatwg_label; -use encoding::DecoderTrap; use ordered_float::OrderedFloat; use strsim::jaro; @@ -20,18 +18,19 @@ use strsim::jaro; // Return associated unicode ranges in a single byte code page. pub(crate) fn encoding_unicode_range(iana_name: &str) -> Result, String> { - if is_multi_byte_encoding(iana_name) { + let encoder = + Encoding::by_name(iana_name).ok_or("No decoder found for this encoding".to_string())?; + + if encoder.is_multi_byte_encoding() { return Err("Function not supported on multi-byte code page".to_string()); } - let encoder = encoding_from_whatwg_label(iana_name) - .ok_or("No decoder found for this encoding".to_string())?; let byte_range = 0x40..0xFF; // utf8 range. range.len()==191 let mut result: HashMap<&str, u8> = HashMap::with_capacity(byte_range.len()); byte_range.for_each(|i| { if let Some(range) = encoder - .decode(&[i], DecoderTrap::Ignore) + .decode(&[i], WantDecode::Yes, IsChunk::No) .ok() .and_then(|chunk| chunk.chars().next()) .and_then(unicode_range) @@ -55,11 +54,12 @@ pub(crate) fn encoding_unicode_range(iana_name: &str) -> Result, Strin pub(crate) fn unicode_range_languages(primary_range: &str) -> Vec<&'static Language> { LANGUAGES .iter() - .filter_map(|(language, characters, _, _)| { - characters + .filter_map(|entry| { + entry + .alphabet .chars() .find(|char| unicode_range(*char).unwrap_or_default() == primary_range) - .map(|_| language) + .map(|_| &entry.language) }) .collect::>() } @@ -68,8 +68,8 @@ pub(crate) fn unicode_range_languages(primary_range: &str) -> Vec<&'static Langu // Some code page are heavily linked to particular language(s). // This function does the correspondence. #[cached(size = 128)] -pub(crate) fn encoding_languages(iana_name: String) -> Vec<&'static Language> { - match encoding_unicode_range(&iana_name) +pub(crate) fn encoding_languages(iana_name: &'static str) -> Vec<&'static Language> { + match encoding_unicode_range(iana_name) .unwrap_or_default() .iter() .find(|&&range| !range.contains("Latin")) @@ -79,43 +79,30 @@ pub(crate) fn encoding_languages(iana_name: String) -> Vec<&'static Language> { } } -// Multi-byte encoding language association. Some code page are heavily linked to particular language(s). -// This function does the correspondence. -pub(crate) fn mb_encoding_languages(iana_name: &str) -> Vec<&'static Language> { - ENCODING_TO_LANGUAGE - .get(iana_name) - .map_or(vec![], |found| vec![found]) -} - // Return associated languages associated to given characters -#[allow(clippy::ptr_arg)] pub(crate) fn alphabet_languages( characters: &[char], ignore_non_latin: bool, ) -> Vec<&'static Language> { - let mut languages: Vec<(&Language, OrderedFloat)> = - Vec::with_capacity(*LANGUAGE_SUPPORTED_COUNT); + let mut languages: Vec<(&Language, OrderedFloat)> = Vec::with_capacity(LANGUAGES.len()); let source_characters_set: HashSet = characters.iter().copied().collect(); let source_has_accents = source_characters_set .iter() .any(|&char| is_accentuated(char)); - for (language, language_characters, target_have_accents, target_pure_latin) in LANGUAGES.iter() - { - if (ignore_non_latin && !target_pure_latin) || (!target_have_accents && source_has_accents) - { + for entry in LANGUAGES.iter() { + if (ignore_non_latin && !entry.pure_latin) || (!entry.have_accents && source_has_accents) { continue; } - let language_characters_set: HashSet = language_characters.chars().collect(); - let intersection: HashSet = language_characters_set + let intersection_size = entry + .alphabet_set .intersection(&source_characters_set) - .copied() - .collect(); + .count(); - let ratio: f32 = intersection.len() as f32 / language_characters_set.len() as f32; + let ratio: f32 = intersection_size as f32 / entry.alphabet_set.len() as f32; if ratio >= 0.2 { - languages.push((language, OrderedFloat(ratio))); + languages.push((&entry.language, OrderedFloat(ratio))); } } // reverse sort @@ -152,8 +139,8 @@ pub(crate) fn characters_popularity_compare( language: &Language, ordered_characters: &str, ) -> Result { - let language_data = get_language_data(language)?; - Ok(jaro(ordered_characters, language_data.0) as f32) + let language_data = LanguageEntry::get(language)?; + Ok(jaro(ordered_characters, language_data.alphabet) as f32) } // We shall NOT return more than one "English" in CoherenceMatches because it is an alternative diff --git a/src/consts.rs b/src/consts.rs index 4a1fd7e..9d2caf5 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,8 +1,7 @@ use ahash::{HashMap, HashSet}; use core::ops::RangeInclusive; -use encoding::all::encodings; use once_cell::sync::Lazy; -use regex::Regex; +use regex::bytes::Regex; pub static TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 pub(crate) static MAX_PROCESSED_BYTES: usize = 500_000; @@ -334,327 +333,6 @@ pub(crate) static RE_POSSIBLE_ENCODING_INDICATION: Lazy = Lazy::new(|| { ).unwrap() }); -pub static IANA_SUPPORTED: Lazy> = Lazy::new(|| { - encodings() - .iter() - .filter(|&enc| !["error", "encoder-only-utf-8", "pua-mapped-binary"].contains(&enc.name())) - .map(|&enc| enc.whatwg_name().unwrap_or(enc.name())) - .collect() -}); - -pub static IANA_SUPPORTED_COUNT: Lazy = Lazy::new(|| IANA_SUPPORTED.len()); - -// chardet encoding names (in lowercase!) -pub static CHARDET_CORRESPONDENCE: Lazy> = Lazy::new(|| { - HashMap::from_iter([ - ("tis-620", "windows-874"), - ("utf-16", "utf-16le"), - ("maccyrillic", "x-mac-cyrillic"), - ("gb2312", "gbk"), - ("cp949", "euc-kr"), - ]) -}); - -// aliases (labels) are from https://encoding.spec.whatwg.org/#concept-encoding-get -> as is + lowercased -pub static IANA_SUPPORTED_ALIASES: Lazy>> = - Lazy::new(|| { - HashMap::from_iter([ - ( - "utf-8", - vec![ - "unicode-1-1-utf-8", - "unicode11utf8", - "unicode20utf8", - "utf-8", - "utf8", - "x-unicode20utf8", - ], - ), - ("ibm866", vec!["866", "cp866", "csibm866", "ibm866"]), - ( - "iso-8859-2", - vec![ - "csisolatin2", - "iso-8859-2", - "iso-ir-101", - "iso8859-2", - "iso88592", - "iso_8859-2", - "iso_8859-2:1987", - "l2", - "latin2", - ], - ), - ( - "iso-8859-3", - vec![ - "csisolatin3", - "iso-8859-3", - "iso-ir-109", - "iso8859-3", - "iso88593", - "iso_8859-3", - "iso_8859-3:1988", - "l3", - "latin3", - ], - ), - ( - "iso-8859-4", - vec![ - "csisolatin4", - "iso-8859-4", - "iso-ir-110", - "iso8859-4", - "iso88594", - "iso_8859-4", - "iso_8859-4:1988", - "l4", - "latin4", - ], - ), - ( - "iso-8859-5", - vec![ - "csisolatincyrillic", - "cyrillic", - "iso-8859-5", - "iso-ir-144", - "iso8859-5", - "iso88595", - "iso_8859-5", - "iso_8859-5:1988", - ], - ), - ( - "iso-8859-6", - vec![ - "arabic", - "asmo-708", - "csiso88596e", - "csiso88596i", - "csisolatinarabic", - "ecma-114", - "iso-8859-6", - "iso-8859-6-e", - "iso-8859-6-i", - "iso-ir-127", - "iso8859-6", - "iso88596", - "iso_8859-6", - "iso_8859-6:1987", - ], - ), - ( - "iso-8859-7", - vec![ - "csisolatingreek", - "ecma-118", - "elot_928", - "greek", - "greek8", - "iso-8859-7", - "iso-ir-126", - "iso8859-7", - "iso88597", - "iso_8859-7", - "iso_8859-7:1987", - "sun_eu_greek", - ], - ), - ( - "iso-8859-8", - vec![ - "csiso88598e", - "csisolatinhebrew", - "hebrew", - "iso-8859-8", - "iso-8859-8-e", - "iso-ir-138", - "iso8859-8", - "iso88598", - "iso_8859-8", - "iso_8859-8:1988", - "visual", - ], - ), - ( - "iso-8859-8-i", - vec!["csiso88598i", "iso-8859-8-i", "logical"], - ), - ( - "iso-8859-10", - vec![ - "csisolatin6", - "iso-8859-10", - "iso-ir-157", - "iso8859-10", - "iso885910", - "l6", - "latin6", - ], - ), - ( - "iso-8859-13", - vec!["iso-8859-13", "iso8859-13", "iso885913"], - ), - ( - "iso-8859-14", - vec!["iso-8859-14", "iso8859-14", "iso885914"], - ), - ( - "iso-8859-15", - vec![ - "csisolatin9", - "iso-8859-15", - "iso8859-15", - "iso885915", - "iso_8859-15", - "l9", - ], - ), - ("iso-8859-16", vec!["iso-8859-16"]), - ("koi8-r", vec!["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"]), - ("koi8-u", vec!["koi8-ru", "koi8-u"]), - ( - "macintosh", - vec!["csmacintosh", "mac", "macintosh", "x-mac-roman"], - ), - ( - "windows-874", - vec![ - "dos-874", - "iso-8859-11", - "iso8859-11", - "iso885911", - "tis-620", - "windows-874", - ], - ), - ("windows-1250", vec!["cp1250", "windows-1250", "x-cp1250"]), - ("windows-1251", vec!["cp1251", "windows-1251", "x-cp1251"]), - ( - "windows-1252", - vec![ - "ansi_x3.4-1968", - "ascii", - "cp1252", - "cp819", - "csisolatin1", - "ibm819", - "iso-8859-1", - "iso-ir-100", - "iso8859-1", - "iso88591", - "iso_8859-1", - "iso_8859-1:1987", - "l1", - "latin1", - "us-ascii", - "windows-1252", - "x-cp1252", - ], - ), - ("windows-1253", vec!["cp1253", "windows-1253", "x-cp1253"]), - ( - "windows-1254", - vec![ - "cp1254", - "csisolatin5", - "iso-8859-9", - "iso-ir-148", - "iso8859-9", - "iso88599", - "iso_8859-9", - "iso_8859-9:1989", - "l5", - "latin5", - "windows-1254", - "x-cp1254", - ], - ), - ("windows-1255", vec!["cp1255", "windows-1255", "x-cp1255"]), - ("windows-1256", vec!["cp1256", "windows-1256", "x-cp1256"]), - ("windows-1257", vec!["cp1257", "windows-1257", "x-cp1257"]), - ("windows-1258", vec!["cp1258", "windows-1258", "x-cp1258"]), - ("x-mac-cyrillic", vec!["x-mac-cyrillic", "x-mac-ukrainian"]), - ( - "gbk", - vec![ - "chinese", - "csgb2312", - "csiso58gb231280", - "gb2312", - "gb_2312", - "gb_2312-80", - "gbk", - "iso-ir-58", - "x-gbk", - ], - ), - ("gb18030", vec!["gb18030"]), - ( - "big5", - vec!["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"], - ), - ("euc-jp", vec!["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]), - ("iso-2022-jp", vec!["csiso2022jp", "iso-2022-jp"]), - ( - "shift_jis", - vec![ - "csshiftjis", - "ms932", - "ms_kanji", - "shift-jis", - "shift_jis", - "sjis", - "windows-31j", - "x-sjis", - ], - ), - ( - "euc-kr", - vec![ - "cseuckr", - "csksc56011987", - "euc-kr", - "iso-ir-149", - "korean", - "ks_c_5601-1987", - "ks_c_5601-1989", - "ksc5601", - "ksc_5601", - "windows-949", - ], - ), - ( - "replacement", - vec![ - "csiso2022kr", - "hz-gb-2312", - "iso-2022-cn", - "iso-2022-cn-ext", - "iso-2022-kr", - "replacement", - ], - ), - ("utf-16be", vec!["unicodefffe", "utf-16be"]), - ( - "utf-16le", - vec![ - "csunicode", - "iso-10646-ucs-2", - "ucs-2", - "unicode", - "unicodefeff", - "utf-16", - "utf-16le", - ], - ), - ("x-user-defined", vec!["x-user-defined"]), - ]) - }); - pub static IANA_SUPPORTED_SIMILAR: Lazy>> = Lazy::new(|| { HashMap::from_iter([ diff --git a/src/enc.rs b/src/enc.rs new file mode 100644 index 0000000..80b588e --- /dev/null +++ b/src/enc.rs @@ -0,0 +1,823 @@ +use crate::entity::Language; +use encoding_rs::DecoderResult; +use encoding_rs::Encoding as EncodingImpl; +use once_cell::sync::Lazy; +use std::collections::HashMap; + +/// Represents a character set encoding scheme +#[derive(Copy, Clone)] +pub struct Encoding { + /// Canonical name + name: &'static str, + is_multi_byte_encoding: bool, + /// Acceptable aliases from -> as is + lowercased + aliases: &'static [&'static str], + + encoder_impl: Option<&'static EncodingImpl>, + language: Option, +} + +impl std::fmt::Display for Encoding { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + self.name.fmt(fmt) + } +} + +impl std::fmt::Debug for Encoding { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + self.name.fmt(fmt) + } +} + +impl PartialEq for Encoding { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + } +} + +impl Eq for Encoding {} + +impl std::hash::Hash for Encoding { + fn hash(&self, h: &mut H) + where + H: std::hash::Hasher, + { + self.name.hash(h) + } +} + +/// Whether the input should be processed chunk-wise. +/// If so, the decode will nibble off the start/end +/// of the buffer to find a subset that successfully +/// decodes +#[derive(PartialEq, Copy, Clone, Debug)] +pub(crate) enum IsChunk { + Yes, + No, +} + +/// Whether the full decoded output is required. +/// If not, memory utilization can be reduced by +/// using smaller or no buffer to hold the decoded +/// result; an empty or partial string will be +/// returned from the decode +#[derive(PartialEq, Copy, Clone, Debug)] +pub(crate) enum WantDecode { + Yes, + No, +} + +impl Encoding { + /// Given a charset encoding name or label, return an `Encoding` + /// object that corresponds to the implementation of that scheme. + /// Can return None if the name is unknown. Supports a number + /// of standard aliases as well as case insensitive names. + pub fn by_name(name: &str) -> Option<&'static Encoding> { + match BY_NAME.get(name) { + Some(enc) => Some(enc), + None => { + if name.chars().any(|c| c.is_ascii_uppercase()) { + Self::by_name(&name.to_lowercase()) + } else { + None + } + } + } + } + + /// Returns the list of aliases by which this encoding instance + /// is known + pub fn aliases(&self) -> &'static [&'static str] { + self.aliases + } + + /// Returns the canonical name of this encoding + pub fn name(&self) -> &str { + self.name + } + + /// Returns true if this encoding scheme requires a byte order marker + pub fn requires_bom(&self) -> bool { + matches!(self.name, "utf-16le" | "utf-16be") + } + + /// Returns true if this encoding potentially encodes code points using + /// sequences of more than a single byte + pub fn is_multi_byte_encoding(&self) -> bool { + self.is_multi_byte_encoding + } + + /// Multi-byte encoding language association. + /// Some code page are heavily linked to particular language(s). + pub fn language(&self) -> Option<&Language> { + self.language.as_ref() + } + + /// Encodes a unicode string into a sequence of bytes + /// If ignore_errors is true, returns whatever the underlying + /// encoder managed to encode if there was some error processing + /// the encode operation. + /// + /// Note that this is, barring errors, the symmetric operation to + /// the decode method. + pub fn encode(&self, input: &str, ignore_errors: bool) -> Result, String> { + match self.encoder_impl { + None => Ok(input.as_bytes().to_vec()), + Some(enc) => { + match self.name() { + // encoding_rs has the slightly surprising behavior + // of encoding utf-16 as utf8 (because that is what + // should be used for the web), so we need to handle + // that encoding case for ourselves here. + "utf-16le" => { + let mut bytes = vec![]; + for c in input.encode_utf16() { + for b in c.to_le_bytes() { + bytes.push(b); + } + } + Ok(bytes) + } + "utf-16be" => { + let mut bytes = vec![]; + for c in input.encode_utf16() { + for b in c.to_be_bytes() { + bytes.push(b); + } + } + Ok(bytes) + } + _ => { + let (cow, used, ok) = enc.encode(input); + if ok || ignore_errors { + Ok(cow.into()) + } else { + Err(format!( + "encoding replaced chars. used={}, {cow:x?}", + used.name() + )) + } + } + } + } + } + } + + /// Attempts to decode a sequence of bytes using this encoding scheme + pub fn decode_simple(&self, input: &[u8]) -> Result { + self.decode(input, WantDecode::Yes, IsChunk::No) + } + + pub(crate) fn decode( + &self, + input: &[u8], + want_decode: WantDecode, + is_chunk: IsChunk, + ) -> Result { + match self.encoder_impl { + // The ascii special case + None => { + let len = input.len(); + let valid_to = encoding_rs::Encoding::ascii_valid_up_to(input); + if valid_to != len { + Err(format!("8-bit input detected at index {valid_to}")) + } else { + match want_decode { + WantDecode::Yes => Ok(std::str::from_utf8(input) + .map_err(|err| format!("{err:#}"))? + .to_string()), + WantDecode::No => Ok(String::new()), + } + } + } + Some(enc) => { + let mut begin_offset = 0; + let mut end_offset = input.len(); + + loop { + let chunk = &input[begin_offset..end_offset]; + + match decode_buffer(enc, chunk, want_decode) { + BufferResult::Decoded(result) => return Ok(result), + BufferResult::Adjust { + begin, + end, + consumed, + } => { + let mut terminate = false; + match is_chunk { + IsChunk::Yes => { + if consumed <= 1 { + // Bad sequence at the start + begin_offset += begin; + } else { + end_offset = end_offset.saturating_sub(end); + } + + if end_offset - begin_offset < 1 + || begin_offset > 3 + || input.len() - end_offset > 3 + { + terminate = true; + } + } + IsChunk::No => { + terminate = true; + } + } + + if terminate { + if consumed <= 1 { + return Err(format!("invalid sequence at {consumed}")); + } + return Err(format!("incomplete sequence at {consumed}")); + } + } + } + } + } + } + } +} + +enum BufferResult { + Decoded(String), + Adjust { + consumed: usize, + begin: usize, + end: usize, + }, +} + +/// Decode or, if WantDecode::No, validate the decode of, buffer. +/// If the decode is successful, returns either the decoded buffer +/// or an empty string depending on WantDecode. +/// If the decode fails, returns an Adjust variant indicating +/// how much was consumed and where to adjust the buffer if IsChunk +/// is being used. +fn decode_buffer( + encoding: &'static encoding_rs::Encoding, + mut chunk: &[u8], + want_decode: WantDecode, +) -> BufferResult { + let mut decoder = encoding.new_decoder(); + + const CHUNK_SIZE: usize = 2048; + let mut buffer_bytes = [0u8; CHUNK_SIZE]; + let buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap(); + let mut bytes_in_buffer = 0usize; + let mut result_string = String::new(); + let mut consumed = 0usize; + + loop { + let (result, read, written) = + decoder.decode_to_str_without_replacement(chunk, &mut buffer[bytes_in_buffer..], false); + + consumed += read; + bytes_in_buffer += written; + + match result { + DecoderResult::InputEmpty => { + break; + } + DecoderResult::OutputFull => { + match want_decode { + WantDecode::Yes => { + result_string.push_str(&buffer[..bytes_in_buffer]); + } + WantDecode::No => {} + } + bytes_in_buffer = 0; + chunk = &chunk[read..]; + continue; + } + DecoderResult::Malformed(len, consumed_after) => { + if consumed <= 1 { + // Bad sequence at the start + return BufferResult::Adjust { + begin: (len + consumed_after).max(1) as usize, + end: 0, + consumed, + }; + } else { + return BufferResult::Adjust { + begin: 0, + end: 1, + consumed, + }; + } + } + } + } + // Flush any buffered output if needed + match want_decode { + WantDecode::Yes => { + loop { + let (result, _, written) = decoder.decode_to_str_without_replacement( + b"", + &mut buffer[bytes_in_buffer..], + true, + ); + bytes_in_buffer += written; + // Write the current buffer out and consider the buffer empty. + // Need to do this here for both `match` arms, because we exit the + // loop on `DecoderResult::InputEmpty`. + result_string.push_str(&buffer[..bytes_in_buffer]); + bytes_in_buffer = 0usize; + + match result { + DecoderResult::InputEmpty => { + // Done! + break; + } + DecoderResult::OutputFull => { + continue; + } + DecoderResult::Malformed(len, consumed_after) => { + if consumed <= 1 { + // Bad sequence at the start + return BufferResult::Adjust { + begin: (len + consumed_after).max(1) as usize, + end: 0, + consumed, + }; + } else { + return BufferResult::Adjust { + begin: 0, + end: 1, + consumed, + }; + } + } + } + } + } + WantDecode::No => {} + } + + BufferResult::Decoded(result_string) +} + +pub(crate) static BY_NAME: Lazy> = Lazy::new(|| { + let mut map = HashMap::new(); + for enc in ALL { + for &name in enc.aliases { + map.insert(name, enc); + } + } + map +}); + +/// All known/supported `Encoding`s known to this crate +pub static ALL: &[Encoding] = &[ + Encoding { + // See comments in windows-1252 below re: ascii aliasing with cp1252 + // and why that isn't the case here + name: "ascii", + is_multi_byte_encoding: false, + aliases: &["ascii", "us-ascii"], + encoder_impl: None, + language: None, + }, + Encoding { + name: "ibm866", + is_multi_byte_encoding: false, + aliases: &["866", "cp866", "csibm866", "ibm866"], + encoder_impl: Some(encoding_rs::IBM866), + language: None, + }, + Encoding { + name: "iso-8859-2", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatin2", + "iso-8859-2", + "iso-ir-101", + "iso8859-2", + "iso88592", + "iso_8859-2", + "iso_8859-2:1987", + "l2", + "latin2", + ], + encoder_impl: Some(encoding_rs::ISO_8859_2), + language: None, + }, + Encoding { + name: "iso-8859-3", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatin3", + "iso-8859-3", + "iso-ir-109", + "iso8859-3", + "iso88593", + "iso_8859-3", + "iso_8859-3:1988", + "l3", + "latin3", + ], + encoder_impl: Some(encoding_rs::ISO_8859_3), + language: None, + }, + Encoding { + name: "iso-8859-4", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatin4", + "iso-8859-4", + "iso-ir-110", + "iso8859-4", + "iso88594", + "iso_8859-4", + "iso_8859-4:1988", + "l4", + "latin4", + ], + encoder_impl: Some(encoding_rs::ISO_8859_4), + language: None, + }, + Encoding { + name: "iso-8859-5", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatincyrillic", + "cyrillic", + "iso-8859-5", + "iso-ir-144", + "iso8859-5", + "iso88595", + "iso_8859-5", + "iso_8859-5:1988", + ], + encoder_impl: Some(encoding_rs::ISO_8859_5), + language: None, + }, + Encoding { + name: "iso-8859-6", + is_multi_byte_encoding: false, + aliases: &[ + "arabic", + "asmo-708", + "csiso88596e", + "csiso88596i", + "csisolatinarabic", + "ecma-114", + "iso-8859-6", + "iso-8859-6-e", + "iso-8859-6-i", + "iso-ir-127", + "iso8859-6", + "iso88596", + "iso_8859-6", + "iso_8859-6:1987", + ], + encoder_impl: Some(encoding_rs::ISO_8859_6), + language: None, + }, + Encoding { + name: "iso-8859-7", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatingreek", + "ecma-118", + "elot_928", + "greek", + "greek8", + "iso-8859-7", + "iso-ir-126", + "iso8859-7", + "iso88597", + "iso_8859-7", + "iso_8859-7:1987", + "sun_eu_greek", + ], + encoder_impl: Some(encoding_rs::ISO_8859_7), + language: None, + }, + Encoding { + name: "iso-8859-8", + is_multi_byte_encoding: false, + aliases: &[ + "csiso88598e", + "csisolatinhebrew", + "hebrew", + "iso-8859-8", + "iso-8859-8-e", + "iso-ir-138", + "iso8859-8", + "iso88598", + "iso_8859-8", + "iso_8859-8:1988", + "visual", + ], + encoder_impl: Some(encoding_rs::ISO_8859_8), + language: None, + }, + Encoding { + name: "iso-8859-10", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatin6", + "iso-8859-10", + "iso-ir-157", + "iso8859-10", + "iso885910", + "l6", + "latin6", + ], + encoder_impl: Some(encoding_rs::ISO_8859_10), + language: None, + }, + Encoding { + name: "iso-8859-13", + is_multi_byte_encoding: false, + aliases: &["iso-8859-13", "iso8859-13", "iso885913"], + encoder_impl: Some(encoding_rs::ISO_8859_13), + language: None, + }, + Encoding { + name: "iso-8859-14", + is_multi_byte_encoding: false, + aliases: &["iso-8859-14", "iso8859-14", "iso885914"], + encoder_impl: Some(encoding_rs::ISO_8859_14), + language: None, + }, + Encoding { + name: "iso-8859-15", + is_multi_byte_encoding: false, + aliases: &[ + "csisolatin9", + "iso-8859-15", + "iso8859-15", + "iso885915", + "iso_8859-15", + "l9", + ], + encoder_impl: Some(encoding_rs::ISO_8859_15), + language: None, + }, + Encoding { + name: "iso-8859-16", + is_multi_byte_encoding: false, + aliases: &["iso-8859-16"], + encoder_impl: Some(encoding_rs::ISO_8859_16), + language: None, + }, + Encoding { + name: "koi8-r", + is_multi_byte_encoding: false, + aliases: &["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"], + encoder_impl: Some(encoding_rs::KOI8_R), + language: None, + }, + Encoding { + name: "koi8-u", + is_multi_byte_encoding: false, + aliases: &["koi8-ru", "koi8-u"], + encoder_impl: Some(encoding_rs::KOI8_U), + language: None, + }, + Encoding { + name: "macintosh", + is_multi_byte_encoding: false, + aliases: &["csmacintosh", "mac", "macintosh", "x-mac-roman"], + encoder_impl: Some(encoding_rs::MACINTOSH), + language: None, + }, + Encoding { + name: "windows-874", + is_multi_byte_encoding: false, + aliases: &[ + "dos-874", + "iso-8859-11", + "iso8859-11", + "iso885911", + "tis-620", + "windows-874", + ], + encoder_impl: Some(encoding_rs::WINDOWS_874), + language: None, + }, + Encoding { + name: "windows-1250", + is_multi_byte_encoding: false, + aliases: &["cp1250", "windows-1250", "x-cp1250"], + encoder_impl: Some(encoding_rs::WINDOWS_1250), + language: None, + }, + Encoding { + name: "windows-1251", + is_multi_byte_encoding: false, + aliases: &["cp1251", "windows-1251", "x-cp1251"], + encoder_impl: Some(encoding_rs::WINDOWS_1251), + language: None, + }, + Encoding { + name: "windows-1252", + is_multi_byte_encoding: false, + aliases: &[ + "ansi_x3.4-1968", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "windows-1252", + "x-cp1252", + // Note: + // specifies that ascii is simply an alias for cp1252, but + // the various detection tests in this crate will fail if + // we make it a strict alias, so we have a separate ascii + // Encoding object and do not include the ascii aliases here + // "ascii", + // "us-ascii", + ], + encoder_impl: Some(encoding_rs::WINDOWS_1252), + language: None, + }, + Encoding { + name: "windows-1253", + is_multi_byte_encoding: false, + aliases: &["cp1253", "windows-1253", "x-cp1253"], + encoder_impl: Some(encoding_rs::WINDOWS_1253), + language: None, + }, + Encoding { + name: "windows-1254", + is_multi_byte_encoding: false, + aliases: &[ + "cp1254", + "csisolatin5", + "iso-8859-9", + "iso-ir-148", + "iso8859-9", + "iso88599", + "iso_8859-9", + "iso_8859-9:1989", + "l5", + "latin5", + "windows-1254", + "x-cp1254", + ], + encoder_impl: Some(encoding_rs::WINDOWS_1254), + language: None, + }, + Encoding { + name: "windows-1255", + is_multi_byte_encoding: false, + aliases: &["cp1255", "windows-1255", "x-cp1255"], + encoder_impl: Some(encoding_rs::WINDOWS_1255), + language: None, + }, + Encoding { + name: "windows-1256", + is_multi_byte_encoding: false, + aliases: &["cp1256", "windows-1256", "x-cp1256"], + encoder_impl: Some(encoding_rs::WINDOWS_1256), + language: None, + }, + Encoding { + name: "windows-1257", + is_multi_byte_encoding: false, + aliases: &["cp1257", "windows-1257", "x-cp1257"], + encoder_impl: Some(encoding_rs::WINDOWS_1257), + language: None, + }, + Encoding { + name: "windows-1258", + is_multi_byte_encoding: false, + aliases: &["cp1258", "windows-1258", "x-cp1258"], + encoder_impl: Some(encoding_rs::WINDOWS_1258), + language: None, + }, + Encoding { + name: "x-mac-cyrillic", + is_multi_byte_encoding: false, + aliases: &["x-mac-cyrillic", "x-mac-ukrainian"], + encoder_impl: Some(encoding_rs::X_MAC_CYRILLIC), + language: None, + }, + Encoding { + name: "gbk", + is_multi_byte_encoding: true, + aliases: &[ + "chinese", + "csgb2312", + "csiso58gb231280", + "gb2312", + "gb_2312", + "gb_2312-80", + "gbk", + "iso-ir-58", + "x-gbk", + ], + encoder_impl: Some(encoding_rs::GBK), + language: Some(Language::Chinese), + }, + Encoding { + name: "gb18030", + is_multi_byte_encoding: true, + aliases: &["gb18030"], + encoder_impl: Some(encoding_rs::GB18030), + language: Some(Language::Chinese), + }, + Encoding { + name: "big5", + is_multi_byte_encoding: true, + aliases: &["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"], + encoder_impl: Some(encoding_rs::BIG5), + language: Some(Language::Chinese), + }, + Encoding { + name: "euc-jp", + is_multi_byte_encoding: true, + aliases: &["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"], + encoder_impl: Some(encoding_rs::EUC_JP), + language: Some(Language::Japanese), + }, + Encoding { + name: "iso-2022-jp", + is_multi_byte_encoding: true, + aliases: &["csiso2022jp", "iso-2022-jp"], + encoder_impl: Some(encoding_rs::ISO_2022_JP), + language: Some(Language::Japanese), + }, + Encoding { + name: "shift_jis", + is_multi_byte_encoding: true, + aliases: &[ + "csshiftjis", + "ms932", + "ms_kanji", + "shift-jis", + "shift_jis", + "sjis", + "windows-31j", + "x-sjis", + ], + encoder_impl: Some(encoding_rs::SHIFT_JIS), + language: Some(Language::Japanese), + }, + Encoding { + name: "euc-kr", + is_multi_byte_encoding: true, + aliases: &[ + "cseuckr", + "csksc56011987", + "euc-kr", + "iso-ir-149", + "korean", + "ks_c_5601-1987", + "ks_c_5601-1989", + "ksc5601", + "ksc_5601", + "windows-949", + ], + encoder_impl: Some(encoding_rs::EUC_KR), + language: Some(Language::Korean), + }, + Encoding { + name: "utf-16be", + is_multi_byte_encoding: true, + aliases: &["unicodefffe", "utf-16be"], + encoder_impl: Some(encoding_rs::UTF_16BE), + language: None, + }, + Encoding { + name: "utf-16le", + is_multi_byte_encoding: true, + aliases: &[ + "csunicode", + "iso-10646-ucs-2", + "ucs-2", + "unicode", + "unicodefeff", + "utf-16", + "utf-16le", + ], + encoder_impl: Some(encoding_rs::UTF_16LE), + language: None, + }, + Encoding { + name: "utf-8", + is_multi_byte_encoding: true, + aliases: &[ + "unicode-1-1-utf-8", + "unicode11utf8", + "unicode20utf8", + "utf-8", + "utf8", + "x-unicode20utf8", + ], + encoder_impl: Some(encoding_rs::UTF_8), + language: None, + }, +]; diff --git a/src/entity.rs b/src/entity.rs index 8c67b5c..decd7ea 100644 --- a/src/entity.rs +++ b/src/entity.rs @@ -1,11 +1,10 @@ #![allow(unused_variables)] -use crate::cd::{encoding_languages, mb_encoding_languages}; -use crate::consts::{IANA_SUPPORTED_ALIASES, TOO_BIG_SEQUENCE}; -use crate::utils::{decode, iana_name, is_multi_byte_encoding, range_scan}; -use encoding::DecoderTrap; +use crate::cd::encoding_languages; +use crate::consts::TOO_BIG_SEQUENCE; +use crate::enc::{Encoding, IsChunk, WantDecode}; +use crate::utils::range_scan; use ordered_float::OrderedFloat; -use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::fmt::{Debug, Display, Formatter}; @@ -16,7 +15,7 @@ use std::ops::Index; // Languages ///////////////////////////////////////////////////////////////////////////////////// -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] pub enum Language { English, German, @@ -83,8 +82,8 @@ pub(crate) type CoherenceMatches = Vec; #[derive(Clone)] pub struct CharsetMatch { - payload: Cow<'static, [u8]>, - encoding: String, + encoding: &'static Encoding, + payload_len: usize, mean_mess_ratio: OrderedFloat, coherence_matches: CoherenceMatches, @@ -97,21 +96,21 @@ pub struct CharsetMatch { impl Display for CharsetMatch { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{:?} ({})", self.payload, self.encoding) + write!(f, "{:?} ({})", self.decoded_payload, self.encoding) } } impl Debug for CharsetMatch { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{:?} ({})", self.payload, self.encoding) + write!(f, "{:?} ({})", self.decoded_payload, self.encoding) } } impl Default for CharsetMatch { fn default() -> Self { CharsetMatch { - payload: Cow::Borrowed(&[]), - encoding: "utf-8".to_string(), + encoding: Encoding::by_name("utf-8").expect("have utf8"), + payload_len: 0, mean_mess_ratio: OrderedFloat(0.0), coherence_matches: vec![], has_sig_or_bom: false, @@ -161,22 +160,23 @@ impl PartialOrd for CharsetMatch { impl CharsetMatch { // Init function pub(crate) fn new( - payload: Cow<'static, [u8]>, - encoding: &str, + payload: &[u8], + encoding: &'static Encoding, mean_mess_ratio: f32, has_sig_or_bom: bool, coherence_matches: &CoherenceMatches, decoded_payload: Option<&str>, ) -> Self { CharsetMatch { - payload: payload.clone(), - encoding: String::from(encoding), + encoding, + payload_len: payload.len(), mean_mess_ratio: OrderedFloat(mean_mess_ratio), coherence_matches: coherence_matches.clone(), has_sig_or_bom, submatch: vec![], decoded_payload: decoded_payload.map(String::from).or_else(|| { - decode(&payload, encoding, DecoderTrap::Strict, false, true) + encoding + .decode(payload, WantDecode::Yes, IsChunk::Yes) .ok() .map(|res| res.strip_prefix('\u{feff}').unwrap_or(&res).to_string()) }), @@ -189,79 +189,86 @@ impl CharsetMatch { //self.decoded_payload = None; } - // Get encoding aliases according to https://encoding.spec.whatwg.org/encodings.json - pub fn encoding_aliases(&self) -> Vec<&'static str> { - IANA_SUPPORTED_ALIASES - .get(self.encoding.as_str()) - .cloned() - .expect("Problem with static HashMap IANA_SUPPORTED_ALIASES") + /// Get encoding aliases according to + pub fn encoding_aliases(&self) -> &'static [&'static str] { + self.encoding.aliases() } - // byte_order_mark + + /// Did this match have a byte order mark? pub fn bom(&self) -> bool { self.has_sig_or_bom } - pub fn encoding(&self) -> &str { - &self.encoding + + pub fn encoding(&self) -> &'static Encoding { + self.encoding } pub fn chaos(&self) -> f32 { self.mean_mess_ratio.0 } - // Most probable language found in decoded sequence. If none were detected or inferred, the property will return - // Language::Unknown + + /// Most probable language found in decoded sequence. If none were detected or inferred, the property will return + /// Language::Unknown pub fn most_probably_language(&self) -> &'static Language { self.coherence_matches.first().map_or_else( // Default case: Trying to infer the language based on the given encoding || { - if self.suitable_encodings().contains(&String::from("ascii")) { + if self + .suitable_encodings() + .iter() + .any(|enc| enc.name() == "ascii") + { &Language::English } else { - let languages = if is_multi_byte_encoding(&self.encoding) { - mb_encoding_languages(&self.encoding) + let language = if self.encoding.is_multi_byte_encoding() { + self.encoding.language() } else { - encoding_languages(self.encoding.clone()) + encoding_languages(self.encoding.name()).first().copied() }; - languages.first().copied().unwrap_or(&Language::Unknown) + language.unwrap_or(&Language::Unknown) } }, |lang| lang.language, ) } - // Return the complete list of possible languages found in decoded sequence. - // Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. + + /// Return the complete list of possible languages found in decoded sequence. + /// Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. pub fn languages(&self) -> Vec<&'static Language> { self.coherence_matches .iter() .map(|cm| cm.language) .collect() } - // Has submatch + + /// Has submatch pub fn has_submatch(&self) -> bool { !self.submatch.is_empty() } - // Return submatch list + + /// Return submatch list pub fn submatch(&self) -> &Vec { &self.submatch } - // Multibyte usage ratio + + /// Multibyte usage ratio pub fn multi_byte_usage(&self) -> f32 { let decoded_chars = self.decoded_payload().unwrap_or_default().chars().count() as f32; - let payload_len = self.payload.len() as f32; + let payload_len = self.payload_len as f32; 1.0 - (decoded_chars / payload_len) } - // Original untouched bytes - pub fn raw(&self) -> &[u8] { - &self.payload - } - // Return chaos in percents with rounding + + /// Return chaos in percents with rounding pub fn chaos_percents(&self) -> f32 { self.chaos() * 100.0 } - // Return coherence in percents with rounding + + /// Return coherence in percents with rounding pub fn coherence_percents(&self) -> f32 { self.coherence() * 100.0 } - // Most relevant language coherence + + /// Most relevant language coherence pub fn coherence(&self) -> f32 { self.coherence_matches .first() @@ -269,19 +276,20 @@ impl CharsetMatch { .unwrap_or_default() } - // To recalc decoded_payload field + /// Returns the payload decoded into a string pub fn decoded_payload(&self) -> Option<&str> { self.decoded_payload.as_deref() } - // The complete list of encodings that output the exact SAME str result and therefore could be the originating - // encoding. This list does include the encoding available in property 'encoding'. - pub fn suitable_encodings(&self) -> Vec { - std::iter::once(self.encoding.clone()) - .chain(self.submatch.iter().map(|s| s.encoding.clone())) + /// The complete list of encodings that output the exact SAME str result and therefore could be the originating + /// encoding. This list does include the encoding available in property 'encoding'. + pub fn suitable_encodings(&self) -> Vec<&'static Encoding> { + std::iter::once(self.encoding) + .chain(self.submatch.iter().map(|s| s.encoding)) .collect() } - // Returns sorted list of unicode ranges (if exists) + + /// Returns sorted list of unicode ranges (if exists) pub fn unicode_ranges(&self) -> Vec { let mut ranges: Vec = range_scan(self.decoded_payload().unwrap_or_default()) .iter() @@ -326,7 +334,7 @@ impl CharsetMatches { pub fn append(&mut self, item: CharsetMatch) { // We should disable the submatch factoring when the input file is too heavy // (conserve RAM usage) - if item.payload.len() <= TOO_BIG_SEQUENCE { + if item.payload_len <= TOO_BIG_SEQUENCE { for m in &mut self.items { if m.decoded_payload() == item.decoded_payload() && (m.mean_mess_ratio - item.mean_mess_ratio).abs() < f32::EPSILON @@ -345,10 +353,10 @@ impl CharsetMatches { } // Retrieve a single item either by its position or encoding name (alias may be used here). pub fn get_by_encoding(&self, encoding: &str) -> Option<&CharsetMatch> { - let encoding = iana_name(encoding)?; + let encoding = Encoding::by_name(encoding)?; self.items .iter() - .find(|&i| i.suitable_encodings().contains(&encoding.to_string())) + .find(|&i| i.suitable_encodings().contains(&encoding)) } // Resort items by relevancy (for internal use) fn resort(items: &mut [CharsetMatch]) { diff --git a/src/lib.rs b/src/lib.rs index 5fa3a7c..c66427d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,7 +33,7 @@ //! let result = from_bytes(&vec![0x84, 0x31, 0x95, 0x33], None).unwrap(); //! let best_guess = result.get_best(); //! assert_eq!( -//! best_guess.unwrap().encoding(), +//! best_guess.unwrap().encoding().name(), //! "gb18030", //! ); //! } @@ -48,7 +48,7 @@ //! let result = from_path(Path::new("src/tests/data/samples/sample-chinese.txt"), None).unwrap(); //! let best_guess = result.get_best(); //! assert_eq!( -//! best_guess.unwrap().encoding(), +//! best_guess.unwrap().encoding().name(), //! "big5", //! ); //! } @@ -128,19 +128,14 @@ //! "is_preferred": true //! } //! ``` -use crate::cd::{ - coherence_ratio, encoding_languages, mb_encoding_languages, merge_coherence_ratios, -}; -use crate::consts::{IANA_SUPPORTED, MAX_PROCESSED_BYTES, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE}; +use crate::cd::{coherence_ratio, encoding_languages, merge_coherence_ratios}; +use crate::consts::{MAX_PROCESSED_BYTES, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE}; +use crate::enc::{IsChunk, WantDecode}; use crate::entity::{CharsetMatch, CharsetMatches, CoherenceMatches, NormalizerSettings}; use crate::md::mess_ratio; -use crate::utils::{ - any_specified_encoding, decode, iana_name, identify_sig_or_bom, is_cp_similar, - is_invalid_chunk, is_multi_byte_encoding, -}; -use encoding::DecoderTrap; +use crate::utils::{any_specified_encoding, identify_sig_or_bom, is_cp_similar, is_invalid_chunk}; use log::{debug, trace}; -use std::borrow::Cow; +use std::collections::HashSet; use std::collections::VecDeque; use std::fs::File; use std::io::Read; @@ -151,11 +146,15 @@ pub mod assets; #[allow(clippy::cast_lossless, clippy::cast_precision_loss)] mod cd; pub mod consts; +mod enc; pub mod entity; mod md; mod tests; pub mod utils; +pub use enc::Encoding; +pub use enc::ALL; + /// Given a raw bytes sequence, return the best possibles charset usable to render str objects. /// If there is no results, it is a strong indicator that the source is binary/not text. /// By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence. @@ -175,49 +174,32 @@ pub fn from_bytes( bytes: &[u8], settings: Option, ) -> Result { + // check for empty + let bytes_length = bytes.len(); + if bytes_length == 0 { + debug!("Encoding detection on empty bytes, assuming utf_8 intention."); + return Ok(CharsetMatches::from_single(CharsetMatch::default())); + } + // init settings with default values if it's None and recheck include_encodings and // exclude_encodings settings let mut settings = settings.unwrap_or_default(); - if !settings.include_encodings.is_empty() { - let mut normalized = vec![]; - for enc in &settings.include_encodings { - normalized.push( - iana_name(enc) - .ok_or_else(|| format!("included {enc} is not a valid encoding name"))? - .to_string(), - ); - } - settings.include_encodings = normalized; - trace!( - "include_encodings is set. Use this flag for debugging purpose. \ - Limited list of encoding allowed : {}.", - settings.include_encodings.join(", ") + + let mut include_encodings = HashSet::new(); + let mut exclude_encodings = HashSet::new(); + for enc_name in &settings.include_encodings { + include_encodings.insert( + Encoding::by_name(enc_name) + .ok_or_else(|| format!("included {enc_name} is not a valid encoding name"))?, ); } - if !settings.exclude_encodings.is_empty() { - let mut normalized = vec![]; - for enc in &settings.exclude_encodings { - normalized.push( - iana_name(enc) - .ok_or_else(|| format!("excluded encoding {enc} is not a valid encoding name"))? - .to_string(), - ); - } - settings.exclude_encodings = normalized; - trace!( - "exclude_encodings is set. Use this flag for debugging purpose. \ - Limited list of encoding allowed : {}.", - settings.exclude_encodings.join(", ") + for enc_name in &settings.exclude_encodings { + exclude_encodings.insert( + Encoding::by_name(enc_name) + .ok_or_else(|| format!("excluded {enc_name} is not a valid encoding name"))?, ); } - // check for empty - let bytes_length = bytes.len(); - if bytes_length == 0 { - debug!("Encoding detection on empty bytes, assuming utf_8 intention."); - return Ok(CharsetMatches::from_single(CharsetMatch::default())); - } - // check min length if bytes_length <= (settings.chunk_size * settings.steps) { trace!( @@ -253,18 +235,18 @@ pub fn from_bytes( } // start to build prioritized encodings array - let mut prioritized_encodings: Vec<&str> = vec![]; + let mut prioritized_encodings: Vec<&Encoding> = vec![]; // search for encoding in the content - let mut specified_encoding: String = String::new(); + let mut specified_encoding: Option<&Encoding> = None; if settings.preemptive_behaviour { - if let Some(enc) = any_specified_encoding(bytes, 4096) { - trace!( - "Detected declarative mark in sequence. Priority +1 given for {}.", - &enc - ); - specified_encoding = enc.to_string(); - prioritized_encodings.push(&specified_encoding); + if let Some(enc_name) = any_specified_encoding(bytes, 4096) { + trace!("Detected declarative mark in sequence. Priority +1 given for {enc_name}.",); + + if let Some(enc) = Encoding::by_name(&enc_name) { + specified_encoding.replace(enc); + prioritized_encodings.push(enc); + } } } @@ -272,18 +254,18 @@ pub fn from_bytes( let (sig_encoding, sig_payload) = identify_sig_or_bom(bytes); if let (Some(sig_enc), Some(sig_pay)) = (&sig_encoding, sig_payload) { trace!( - "Detected a SIG or BOM mark on first {} byte(s). Priority +1 given for {}.", + "Detected a SIG or BOM mark on first {} byte(s). Priority +1 given for {sig_enc}.", sig_pay.len(), - sig_enc, ); prioritized_encodings.push(sig_enc); } // add ascii & utf-8 - prioritized_encodings.extend(&["ascii", "utf-8"]); + prioritized_encodings.push(Encoding::by_name("ascii").expect("valid")); + prioritized_encodings.push(Encoding::by_name("utf-8").expect("valid")); // generate array of encodings for probing with prioritizing - let mut iana_encodings: VecDeque<&str> = VecDeque::from(IANA_SUPPORTED.clone()); + let mut iana_encodings: VecDeque<&Encoding> = crate::enc::ALL.iter().collect(); for pe in prioritized_encodings.iter().rev() { if let Some(index) = iana_encodings.iter().position(|x| x == pe) { let value = iana_encodings.remove(index).expect("index found above"); @@ -292,33 +274,26 @@ pub fn from_bytes( } // Main processing loop variables - let mut tested_but_hard_failure: Vec<&str> = vec![]; - let mut tested_but_soft_failure: Vec<&str> = vec![]; + let mut tested_but_hard_failure: Vec<&Encoding> = vec![]; + let mut tested_but_soft_failure: Vec<&Encoding> = vec![]; let mut fallback_ascii: Option = None; let mut fallback_u8: Option = None; let mut fallback_specified: Option = None; let mut results: CharsetMatches = CharsetMatches::default(); - let bytes: Cow<'static, [u8]> = Cow::Owned(bytes.to_vec()); - // Iterate and probe our encodings 'iana_encodings_loop: for encoding_iana in iana_encodings { - if (!settings.include_encodings.is_empty() - && !settings - .include_encodings - .contains(&encoding_iana.to_string())) - || settings - .exclude_encodings - .contains(&encoding_iana.to_string()) + if (!include_encodings.is_empty() && !include_encodings.contains(&encoding_iana)) + || exclude_encodings.contains(&encoding_iana) { continue; } - let bom_or_sig_available: bool = sig_encoding.as_deref() == Some(encoding_iana); + let bom_or_sig_available: bool = sig_encoding == Some(encoding_iana); // let strip_sig_or_bom = true // unlike python version this is always true in rust - let is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana); + let is_multi_byte_decoder: bool = encoding_iana.is_multi_byte_encoding(); // utf-16le & utf-16be cannot be identified without BOM - if !bom_or_sig_available && ["utf-16le", "utf-16be"].contains(&encoding_iana) { + if !bom_or_sig_available && encoding_iana.requires_bom() { trace!( "Encoding {} won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE", encoding_iana, @@ -337,12 +312,14 @@ pub fn from_bytes( true => MAX_PROCESSED_BYTES, false => bytes_length, }; - let decoded_payload: Option = if let Ok(payload) = decode( + let decoded_payload: Option = if let Ok(payload) = encoding_iana.decode( &bytes[start_idx..end_idx], - encoding_iana, - DecoderTrap::Strict, - is_too_large_sequence && !is_multi_byte_decoder, - false, + if is_too_large_sequence && !is_multi_byte_decoder { + WantDecode::No + } else { + WantDecode::Yes + }, + IsChunk::No, ) { (!is_too_large_sequence || is_multi_byte_decoder).then_some(payload) } else { @@ -357,7 +334,7 @@ pub fn from_bytes( // soft failed pre-check // important thing! it occurs sometimes fail detection for encoding_soft_failed in &tested_but_soft_failure { - if is_cp_similar(encoding_iana, encoding_soft_failed) { + if is_cp_similar(encoding_iana.name(), encoding_soft_failed.name()) { trace!("{} is deemed too similar to code page {} and was consider unsuited already. Continuing!", encoding_iana, encoding_soft_failed, @@ -374,9 +351,12 @@ pub fn from_bytes( // detect target languages let target_languages = if is_multi_byte_decoder { - mb_encoding_languages(encoding_iana) + encoding_iana + .language() + .map(|lang| vec![lang]) + .unwrap_or_default() } else { - encoding_languages(encoding_iana.to_string()) + encoding_languages(encoding_iana.name()) }; trace!( "{} should target any language(s) of {:?}", @@ -408,12 +388,10 @@ pub fn from_bytes( .take(settings.chunk_size) .collect()), // Bytes processing - None => decode( + None => encoding_iana.decode( &bytes[offset..(offset + settings.chunk_size).min(seq_len)], - encoding_iana, - DecoderTrap::Strict, - false, - false, + WantDecode::Yes, + IsChunk::No, ), }; @@ -447,13 +425,8 @@ pub fn from_bytes( // We might want to check the remainder of sequence // Only if initial MD tests passes if !lazy_str_hard_failure && is_too_large_sequence && !is_multi_byte_decoder { - let decoded_chunk_result = decode( - &bytes[MAX_PROCESSED_BYTES..], - encoding_iana, - DecoderTrap::Strict, - false, - false, - ); + let decoded_chunk_result = + encoding_iana.decode(&bytes[MAX_PROCESSED_BYTES..], WantDecode::Yes, IsChunk::No); if is_invalid_chunk(&decoded_chunk_result, encoding_iana) { trace!( "LazyStr Loading: After final lookup, code page {} does not fit \ @@ -487,7 +460,7 @@ pub fn from_bytes( && prioritized_encodings.contains(&encoding_iana) { let fallback_entry = Some(CharsetMatch::new( - bytes.clone(), + bytes, encoding_iana, f32::from(settings.threshold), false, @@ -495,10 +468,12 @@ pub fn from_bytes( decoded_payload.as_deref(), )); - match encoding_iana { - e if e == specified_encoding => fallback_specified = fallback_entry, - "ascii" => fallback_ascii = fallback_entry, - _ => fallback_u8 = fallback_entry, + if Some(encoding_iana) == specified_encoding { + fallback_specified = fallback_entry; + } else if encoding_iana.name() == "ascii" { + fallback_ascii = fallback_entry; + } else { + fallback_u8 = fallback_entry; } } continue 'iana_encodings_loop; @@ -513,7 +488,7 @@ pub fn from_bytes( // We shall skip the CD when its about ASCII // Most of the time its not relevant to run "language-detection" on it. let mut cd_ratios: Vec = vec![]; - if encoding_iana != "ascii" { + if encoding_iana.name() != "ascii" { cd_ratios.extend(md_chunks.iter().filter_map(|chunk| { coherence_ratio( chunk.clone(), @@ -536,7 +511,7 @@ pub fn from_bytes( // process results results.append(CharsetMatch::new( - bytes.clone(), + bytes, encoding_iana, mean_mess_ratio, bom_or_sig_available, @@ -545,7 +520,10 @@ pub fn from_bytes( )); if (mean_mess_ratio < 0.1 && prioritized_encodings.contains(&encoding_iana)) - || encoding_iana == sig_encoding.clone().unwrap_or_default() + || sig_encoding + .as_ref() + .map(|&enc| enc == encoding_iana) + .unwrap_or(false) { debug!( "Encoding detection: {} is most likely the one.", @@ -553,7 +531,7 @@ pub fn from_bytes( ); return Ok(CharsetMatches::from_single( results - .get_by_encoding(encoding_iana) + .get_by_encoding(encoding_iana.name()) .ok_or_else(|| format!("{encoding_iana} entry not present"))? .clone(), )); diff --git a/src/md/structs.rs b/src/md/structs.rs index e6e0a90..dbb43c0 100644 --- a/src/md/structs.rs +++ b/src/md/structs.rs @@ -1,6 +1,6 @@ use bitflags::bitflags; use cached::proc_macro::cached; -use cached::UnboundCache; +use cached::SizedCache; use icu_properties::{maps, sets, GeneralCategory, GeneralCategoryGroup, Script}; use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION}; @@ -65,8 +65,8 @@ impl MessDetectorChar { } #[cached( - ty = "UnboundCache", - create = "{ UnboundCache::with_capacity(UTF8_MAXIMAL_ALLOCATION) }", + ty = "SizedCache", + create = "{ SizedCache::with_size(UTF8_MAXIMAL_ALLOCATION) }", convert = r#"{ character }"# )] fn new_mess_detector_character(character: char) -> MessDetectorChar { diff --git a/src/normalizer.rs b/src/normalizer.rs index dcdbd7c..5d236c8 100644 --- a/src/normalizer.rs +++ b/src/normalizer.rs @@ -132,8 +132,8 @@ fn normalizer(args: &CLINormalizerArgs) -> Result { alternative_encodings: m .suitable_encodings() .iter() - .filter(|&e| e != m.encoding()) - .cloned() + .filter(|&&e| e != m.encoding()) + .map(|e| e.name().to_string()) .collect(), language: format!("{}", m.most_probably_language()), alphabets: m.unicode_ranges(), @@ -154,7 +154,7 @@ fn normalizer(args: &CLINormalizerArgs) -> Result { // normalizing if need if args.normalize { - if best_guess.encoding().starts_with("utf") { + if best_guess.encoding().name().starts_with("utf") { eprintln!( "{:?} file does not need to be normalized, as it already came from unicode.", full_path, diff --git a/src/performance.rs b/src/performance.rs index f0335b9..5119634 100644 --- a/src/performance.rs +++ b/src/performance.rs @@ -1,10 +1,8 @@ use chardetng::EncodingDetector; -use charset_normalizer_rs::consts::CHARDET_CORRESPONDENCE; use charset_normalizer_rs::from_bytes; use charset_normalizer_rs::utils::get_large_test_datasets; +use charset_normalizer_rs::Encoding; use clap::Parser; -use encoding::label::encoding_from_whatwg_label; -use encoding::DecoderTrap; use log::trace; use std::collections::{BTreeMap, HashMap}; use std::fs::File; @@ -47,12 +45,12 @@ fn check_result( // if correct encoding wasn't found we will try to decode and compare results let whatwg_correct_encoding = correct_encodings .first() - .and_then(|enc| encoding_from_whatwg_label(enc)); - let whatwg_guessed_encoding = encoding_from_whatwg_label(guessed_encoding); + .and_then(|enc| Encoding::by_name(enc)); + let whatwg_guessed_encoding = Encoding::by_name(guessed_encoding); match (whatwg_correct_encoding, whatwg_guessed_encoding) { (Some(correct_encoding), Some(guessed_encoding)) => { - let correct_decoded = correct_encoding.decode(buffer.as_slice(), DecoderTrap::Strict); - let guessed_decoded = guessed_encoding.decode(buffer.as_slice(), DecoderTrap::Strict); + let correct_decoded = correct_encoding.decode_simple(buffer.as_slice()); + let guessed_decoded = guessed_encoding.decode_simple(buffer.as_slice()); match (correct_decoded, guessed_decoded) { (Ok(correct_result), Ok(guessed_result)) => correct_result == guessed_result, _ => false, @@ -130,11 +128,8 @@ fn performance_compare(args: &PerformanceArgs) -> i32 { "B) chardet", Box::new(|bytes: &Vec| { let detected = &chardet::detect(bytes).0.to_ascii_lowercase(); - let alternative = CHARDET_CORRESPONDENCE.get(&detected.as_str()); - if let Some(r) = encoding_from_whatwg_label(&detected) { - r.whatwg_name() - .unwrap_or(alternative.unwrap_or(&r.name())) - .to_string() + if let Some(r) = Encoding::by_name(&detected) { + r.name().to_string() } else { String::from("None") } diff --git a/src/tests/cd.rs b/src/tests/cd.rs index 57d58c5..0ed4ad3 100644 --- a/src/tests/cd.rs +++ b/src/tests/cd.rs @@ -44,7 +44,7 @@ fn test_encoding_languages() { ("windows-1255", Language::Hebrew), ]; for (input, lang) in tests { - let languages = encoding_languages(input.to_string()); + let languages = encoding_languages(input); assert!(languages.contains(&&lang)); } } diff --git a/src/tests/data/largesets/ascii/CHANGELOG.md b/src/tests/data/largesets/utf-8/CHANGELOG.md similarity index 100% rename from src/tests/data/largesets/ascii/CHANGELOG.md rename to src/tests/data/largesets/utf-8/CHANGELOG.md diff --git a/src/tests/data/largesets/ascii/_ude_1.rst b/src/tests/data/largesets/utf-8/_ude_1.rst similarity index 100% rename from src/tests/data/largesets/ascii/_ude_1.rst rename to src/tests/data/largesets/utf-8/_ude_1.rst diff --git a/src/tests/data/largesets/ascii/iris-utf-8.csv b/src/tests/data/largesets/utf-8/iris-utf-8.csv similarity index 100% rename from src/tests/data/largesets/ascii/iris-utf-8.csv rename to src/tests/data/largesets/utf-8/iris-utf-8.csv diff --git a/src/tests/data/largesets/ascii/iris-utf-8.json b/src/tests/data/largesets/utf-8/iris-utf-8.json similarity index 100% rename from src/tests/data/largesets/ascii/iris-utf-8.json rename to src/tests/data/largesets/utf-8/iris-utf-8.json diff --git a/src/tests/detection_base.rs b/src/tests/detection_base.rs index b0b1544..256ff2d 100644 --- a/src/tests/detection_base.rs +++ b/src/tests/detection_base.rs @@ -1,7 +1,6 @@ use crate::entity::NormalizerSettings; use crate::from_bytes; use crate::utils::encode; -use encoding::EncoderTrap; #[test] fn test_empty() { @@ -14,7 +13,7 @@ fn test_empty() { "Empty bytes payload SHOULD NOT return None" ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "utf-8", "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)" ); @@ -39,17 +38,11 @@ fn test_empty_but_with_bom_or_sig() { &input ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), expected_encoding, "Empty detection but with SIG/BOM is wrongly detected! Input: {:?}", &input ); - assert_eq!( - best_guess.unwrap().raw(), - &input, - "The RAW property should contain the original payload given for detection. Input: {:?}", - &input - ); assert!( best_guess.unwrap().bom(), "The BOM/SIG property should return True. Input: {:?}", @@ -66,37 +59,12 @@ fn test_empty_but_with_bom_or_sig() { #[test] fn test_content_with_bom_or_sig() { - let tests = [ - ( - encode( - "\u{FEFF}我没有埋怨,磋砣的只是一些时间。", - "gb18030", - EncoderTrap::Ignore, - ) - .unwrap(), - "gb18030", - ), - ( - encode( - "\u{FEFF}我没有埋怨,磋砣的只是一些时间。", - "utf-16le", - EncoderTrap::Ignore, - ) - .unwrap(), - "utf-16le", - ), - ( - encode( - "\u{FEFF}我没有埋怨,磋砣的只是一些时间。", - "utf-8", - EncoderTrap::Ignore, - ) - .unwrap(), - "utf-8", - ), - ]; + let input_utf8 = "\u{FEFF}我没有埋怨,磋砣的只是一些时间。"; + let tests = ["gb18030", "utf-16le", "utf-8"]; + let ignore_errors = true; - for (input, expected_encoding) in tests { + for encoding_name in tests { + let input = encode(input_utf8, encoding_name, ignore_errors).unwrap(); let result = from_bytes(&input, None).unwrap(); let best_guess = result.get_best(); assert!( @@ -105,8 +73,8 @@ fn test_content_with_bom_or_sig() { &input ); assert_eq!( - best_guess.unwrap().encoding(), - expected_encoding, + best_guess.unwrap().encoding().name(), + encoding_name, "Detection but with SIG/BOM is wrongly detected! Input: {:?}", &input ); @@ -137,7 +105,7 @@ fn test_obviously_ascii_content() { &input ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "ascii", "Dead-simple ASCII detection is wrongly detected! Input: {:?}", &input @@ -169,7 +137,7 @@ fn test_obviously_utf8_content() { &input ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "utf-8", "Dead-simple UTF-8 detection is wrongly detected! Input: {:?}", &input @@ -195,5 +163,5 @@ fn test_mb_cutting_chk() { let result = from_bytes(payload.as_slice(), Some(settings)).unwrap(); let best_guess = result.get_best().unwrap(); assert_eq!(result.len(), 1); - assert_eq!(best_guess.encoding(), "euc-kr"); + assert_eq!(best_guess.encoding().name(), "euc-kr"); } diff --git a/src/tests/detection_edge_case.rs b/src/tests/detection_edge_case.rs index a2bab77..a827f2e 100644 --- a/src/tests/detection_edge_case.rs +++ b/src/tests/detection_edge_case.rs @@ -13,7 +13,7 @@ fn test_undefined_unicode_ranges() { &input ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "utf-8", "UTF-8 payload wrongly detected! Input: {:?}", &input diff --git a/src/tests/detection_full.rs b/src/tests/detection_full.rs index bd90b31..5048560 100644 --- a/src/tests/detection_full.rs +++ b/src/tests/detection_full.rs @@ -17,7 +17,7 @@ fn test_elementary_detection() { assert!(result.is_ok()); let result = result.unwrap(); let best_guess = result.get_best(); - let enc = best_guess.unwrap().encoding(); + let enc = best_guess.unwrap().encoding().name(); let languages = best_guess.unwrap().languages(); assert!( @@ -52,14 +52,18 @@ fn test_largesets() { let best_guess = result.get_best(); let mut guess_encoding = "None"; if best_guess.is_some() { - guess_encoding = best_guess.unwrap().encoding(); + guess_encoding = best_guess.unwrap().encoding().name(); } + let fail = !encoding.contains(&guess_encoding.to_string()) && (guess_encoding == "None" || encoding .iter() .any(|x| is_multi_byte_encoding(guess_encoding) != is_multi_byte_encoding(x))); - assert!(!fail, "Problems with {}", path); + assert!( + !fail, + "Problems with {path}. expected encoding={encoding:?}, guess={guess_encoding}", + ); } } diff --git a/src/tests/detection_large_payload.rs b/src/tests/detection_large_payload.rs index 8292c40..bf56b52 100644 --- a/src/tests/detection_large_payload.rs +++ b/src/tests/detection_large_payload.rs @@ -13,16 +13,11 @@ fn test_large_payload_utf8_sig_basic_entry() { "Large U8 payload case detection completely failed" ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "utf-8", "Large U8 payload case detection wrongly detected!" ); assert!(best_guess.unwrap().bom(), "SIG/BOM property should be True"); - assert_eq!( - best_guess.unwrap().raw().len(), - payload.len(), - "Large payload should remain untouched when accessed through .raw" - ); } #[test] @@ -36,7 +31,7 @@ fn test_large_payload_ascii_sig_basic_entry() { "Large ASCII payload case detection completely failed" ); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "ascii", "Large ASCII payload case detection wrongly detected!" ); @@ -44,11 +39,6 @@ fn test_large_payload_ascii_sig_basic_entry() { !best_guess.unwrap().bom(), "SIG/BOM property should be False" ); - assert_eq!( - best_guess.unwrap().raw().len(), - payload.len(), - "Large payload should remain untouched when accessed through .raw" - ); } #[test] @@ -64,7 +54,7 @@ fn test_misleading_large_sequence() { let best_guess = result.get_best(); assert!(best_guess.is_some(), "Best guess is exists"); assert_eq!( - best_guess.unwrap().encoding(), + best_guess.unwrap().encoding().name(), "utf-8", "Best guess is not utf-8" ); diff --git a/src/tests/entity.rs b/src/tests/entity.rs index 589ac62..2988312 100644 --- a/src/tests/entity.rs +++ b/src/tests/entity.rs @@ -1,3 +1,4 @@ +use crate::enc::Encoding; use crate::entity::{CharsetMatch, CharsetMatches, CoherenceMatch, Language}; use ordered_float::OrderedFloat; @@ -8,8 +9,8 @@ fn test_charset_matches() { /////////////////////////////////////////////////////////////////////////////////////////// let mut c_matches = CharsetMatches::new(Some(vec![CharsetMatch::new( - (&[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82]).into(), - "utf-8", + &[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82], + Encoding::by_name("utf-8").unwrap(), 0.01, false, &vec![ @@ -28,8 +29,8 @@ fn test_charset_matches() { // append new CharsetMatch c_matches.append(CharsetMatch::new( - (&[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82]).into(), - "utf-16le", + &[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82], + Encoding::by_name("utf-16le").unwrap(), 0.011, false, &vec![ @@ -48,7 +49,7 @@ fn test_charset_matches() { // check best match assert!(c_matches.get_best().is_some()); - assert_eq!(c_matches.get_best().unwrap().encoding(), "utf-8"); + assert_eq!(c_matches.get_best().unwrap().encoding().name(), "utf-8"); // check get by encoding assert!(c_matches.get_by_encoding("utf-8").is_some()); @@ -62,7 +63,7 @@ fn test_charset_matches() { ); // test indexation impl - assert_eq!(c_matches[0].encoding(), "utf-8"); + assert_eq!(c_matches[0].encoding().name(), "utf-8"); // test iteration let mut i = 0; @@ -86,8 +87,8 @@ fn test_charset_matches() { assert_eq!( c_matches[1], CharsetMatch::new( - (&[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82]).into(), - "utf-16le", + &[0xD0, 0xA2, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x82], + Encoding::by_name("utf-16le").unwrap(), 0.044, true, &vec!( @@ -125,7 +126,7 @@ fn test_charset_matches() { // unicode_ranges for m in c_matches.iter_mut() { - if m.encoding() == "utf-8" { + if m.encoding().name() == "utf-8" { assert!(m.unicode_ranges().contains(&String::from("Cyrillic"))); } else { assert!(m diff --git a/src/tests/md.rs b/src/tests/md.rs index a4b2abb..a538a53 100644 --- a/src/tests/md.rs +++ b/src/tests/md.rs @@ -1,7 +1,7 @@ +use crate::enc::{Encoding, IsChunk, WantDecode}; use crate::md::structs::{MessDetectorChar, MessDetectorCharFlags}; use crate::md::*; -use crate::utils::{decode, get_large_test_datasets}; -use encoding::DecoderTrap; +use crate::utils::get_large_test_datasets; use ordered_float::OrderedFloat; use std::fs::File; use std::io::Read; @@ -34,7 +34,7 @@ fn test_mess_ratio() { #[test] fn test_datasets_mess_ratio() { - for (path, encoding) in &get_large_test_datasets().unwrap() { + for (path, encoding_names) in &get_large_test_datasets().unwrap() { let file = File::open(path); if file.is_err() { return; @@ -43,15 +43,17 @@ fn test_datasets_mess_ratio() { if file.unwrap().read_to_end(&mut buffer).is_err() { return; } - if let Ok(decoded_sequence) = decode( - &buffer, - encoding.first().unwrap(), - DecoderTrap::Ignore, - false, - false, - ) { - let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0))); - assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path); + match Encoding::by_name(encoding_names.first().unwrap()) { + Some(encoding) => { + if let Ok(decoded_sequence) = encoding.decode(&buffer, WantDecode::Yes, IsChunk::No) + { + let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0))); + assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path); + } + } + None => { + // Ignore invalid names like `None` + } } } } diff --git a/src/tests/utils.rs b/src/tests/utils.rs index 47566cc..33ecc7d 100644 --- a/src/tests/utils.rs +++ b/src/tests/utils.rs @@ -1,7 +1,7 @@ +use crate::enc::{Encoding, IsChunk, WantDecode}; use crate::entity::NormalizerSettings; use crate::tests::FILES_SAMPLES; use crate::utils::*; -use encoding::DecoderTrap; use std::fs::File; use std::io::Read; use std::path::PathBuf; @@ -72,14 +72,11 @@ fn test_is_multi_byte_encoding() { #[test] fn test_identify_sig_or_bom() { let tests = [ - ( - b"\xef\xbb\xbf lol kek".as_slice(), - Some("utf-8".to_string()), - ), + (b"\xef\xbb\xbf lol kek".as_slice(), Some("utf-8")), (b"lol kek".as_slice(), None), ]; for test in &tests { - assert_eq!(identify_sig_or_bom(test.0).0, test.1); + assert_eq!(identify_sig_or_bom(test.0).0.map(|enc| enc.name()), test.1); } } @@ -92,7 +89,7 @@ fn test_iana_name() { ("korean", Some("euc-kr")), ]; for test in &tests { - assert_eq!(iana_name(test.0), test.1); + assert_eq!(Encoding::by_name(test.0).map(|e| e.name()), test.1); } } @@ -121,12 +118,17 @@ fn test_any_specified_encoding() { (b"", None), (b"# coding: utf-8", Some("utf-8".to_string())), (b"", Some("utf-8".to_string())), - (b"", Some("windows-1252".to_string())), + (b"", Some("ascii".to_string())), (b"", Some("windows-1252".to_string())), (b"", Some("windows-1256".to_string())), ]; - for test in &tests { - assert_eq!(any_specified_encoding(test.0, 4096), test.1); + for (input, enc_name) in &tests { + assert_eq!( + any_specified_encoding(input, 4096), + *enc_name, + "input={}", + String::from_utf8_lossy(input) + ); } } @@ -201,9 +203,11 @@ fn test_decode_test() { (b"\x61\x52\x6f\x64\x20\x5a\x61\x52\x6f\x64\x20\x5a\xaa\xd8\x80\xd9\x80\xd9\x80\xd9\xb9\xd8\x80\xd9\x80\xd9\x80\xd9\x80\xd9\xaf\xd8\x8a\xd9\x80\xd9\x80\xd9\x84\xd9\xd8\x20\xd9\xa7\xd9\x84\xd9\x80\xd9\x80\xd8\x80\xd9\xaa\xd9\x80\xd9\x80\xd9\x80\xd9\x80\xd9\x88\xd9\x82\xd9\x80\xd9\x80\xd9\x8a\xd9\x80\xd9\x80\xd9\x80\xd8\x80\x20\xaa\x85\xd9\x80\xd9\x80\xd9\x80\xd9\x86\xd9\xd9\x20\xd9\x82\xd9\x80\xd8\x80\xd9\xa8\xd9\x80\xd9\x80\x00\x84".to_vec(), "euc-jp", false), (b"\x61\x52\x6f\x64\x20\x5a\x61\x52\x6f\x64\x20\x5a\xaa\xd8".to_vec(), "windows-1251", true), ]; - for test in &tests { - let res = decode(&test.0, test.1, DecoderTrap::Strict, true, false); - assert_eq!(res.is_ok(), test.2); + for (input, enc_name, expect_pass) in &tests { + let res = Encoding::by_name(enc_name) + .unwrap() + .decode(input, WantDecode::No, IsChunk::No); + assert_eq!(res.is_ok(), *expect_pass); } } @@ -213,25 +217,25 @@ fn test_decode_wrong_chunks() { // and decode it without fail // The idea is that decode function should ignore errors in the beginning and ending of chunk let settings = NormalizerSettings::default(); - for sample in &*FILES_SAMPLES { - if sample.1.iter().any(|e| is_multi_byte_encoding(e)) { + for (sample_file_name, sample_encoding_names, _sample_language) in &*FILES_SAMPLES { + if sample_encoding_names + .iter() + .any(|e| is_multi_byte_encoding(e)) + { let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - path.push(format!("src/tests/data/samples/{}", sample.0)); + path.push(format!("src/tests/data/samples/{sample_file_name}")); let mut file = File::open(path.to_str().unwrap()).expect("Cannot open file"); let mut buffer = Vec::new(); file.read_to_end(&mut buffer).expect("Cannot read file"); + eprintln!("Doing {path:?}"); + let encoding = Encoding::by_name(sample_encoding_names.first().unwrap()).unwrap(); for chunk in buffer.chunks(settings.chunk_size) { - let status = decode( - chunk, - sample.1.first().unwrap(), - DecoderTrap::Strict, - true, - true, - ); + eprintln!("processing chunk of size {}", chunk.len()); + let status = encoding.decode(chunk, WantDecode::No, IsChunk::Yes); assert!( status.is_ok(), "Decode error for sample {}, {}", - sample.0, + sample_file_name, status.unwrap_err() ); } diff --git a/src/utils.rs b/src/utils.rs index 6f25107..e1b547a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,19 +1,14 @@ -#![allow(dead_code)] - -use crate::assets::LANGUAGES; use crate::consts::{ - ENCODING_MARKS, IANA_SUPPORTED, IANA_SUPPORTED_SIMILAR, RE_POSSIBLE_ENCODING_INDICATION, + ENCODING_MARKS, IANA_SUPPORTED_SIMILAR, RE_POSSIBLE_ENCODING_INDICATION, UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, }; -use crate::entity::Language; +use crate::enc::{Encoding, IsChunk, WantDecode}; +use std::cmp::Ordering; use ahash::{HashSet, HashSetExt}; -use encoding::label::encoding_from_whatwg_label; -use encoding::{CodecError, DecoderTrap, EncoderTrap, Encoding, EncodingRef, StringWriter}; use icu_normalizer::DecomposingNormalizer; use unicode_names2::name; -use std::borrow::Cow; #[cfg(any(test, feature = "performance"))] use std::path::{Path, PathBuf}; @@ -59,10 +54,21 @@ pub(crate) fn is_unicode_range_secondary(range_name: &str) -> bool { // Retrieve the Unicode range official name from a single character pub(crate) fn unicode_range(character: char) -> Option<&'static str> { let char_code = character as u32; + + let index = UNICODE_RANGES_COMBINED + .binary_search_by(|(_, range)| { + if char_code < *range.start() { + Ordering::Greater + } else if char_code > *range.end() { + Ordering::Less + } else { + Ordering::Equal + } + }) + .ok()?; UNICODE_RANGES_COMBINED - .iter() - .find(|&(_, range)| range.contains(&char_code)) - .map(|(name, _)| *name) + .get(index) + .map(|(name, _range)| *name) } pub(crate) fn range_scan(decoded_sequence: &str) -> HashSet { @@ -86,66 +92,49 @@ pub(crate) fn remove_accent(ch: char) -> char { // Verify is a specific encoding is a multi byte one based on it IANA name pub fn is_multi_byte_encoding(name: &str) -> bool { - [ - "utf-8", - "utf-16le", - "utf-16be", - "euc-jp", - "euc-kr", - "iso-2022-jp", - "gbk", - "gb18030", - "hz", - "big5", - "shift_jis", - ] - .contains(&name) + Encoding::by_name(name) + .map(|enc| enc.is_multi_byte_encoding()) + .unwrap_or(false) } // Try to detect multibyte encoding by signature -pub(crate) fn identify_sig_or_bom(sequence: &[u8]) -> (Option, Option<&[u8]>) { +pub(crate) fn identify_sig_or_bom(sequence: &[u8]) -> (Option<&Encoding>, Option<&[u8]>) { ENCODING_MARKS .iter() .find(|&(_, enc_sig)| sequence.starts_with(enc_sig)) - .map_or((None, None), |(enc_name, enc_sig)| { - (Some((*enc_name).to_string()), Some(*enc_sig)) - }) -} - -// Try to get standard name by alternative labels -pub fn iana_name(cp_name: &str) -> Option<&str> { - IANA_SUPPORTED - .contains(&cp_name) // first just try to search it in our list - .then_some(cp_name) - .or_else(|| { - // if not found, try to use alternative way - encoding_from_whatwg_label(cp_name).map(|enc| enc.whatwg_name().unwrap_or(enc.name())) - }) + .map_or( + (None, None), + |(enc_name, enc_sig)| match Encoding::by_name(enc_name) { + Some(enc) => (Some(enc), Some(*enc_sig)), + None => (None, Some(*enc_sig)), + }, + ) } pub(crate) fn is_cp_similar(iana_name_a: &str, iana_name_b: &str) -> bool { - IANA_SUPPORTED_SIMILAR.contains_key(iana_name_a) - && IANA_SUPPORTED_SIMILAR[iana_name_a].contains(&iana_name_b) + IANA_SUPPORTED_SIMILAR + .get(iana_name_a) + .map(|candidates| candidates.contains(&iana_name_b)) + .unwrap_or(false) } // Extract using ASCII-only decoder any specified encoding in the first n-bytes. pub(crate) fn any_specified_encoding(sequence: &[u8], search_zone: usize) -> Option { - encoding::all::ASCII - .decode( - &sequence[0..search_zone.min(sequence.len())], - DecoderTrap::Ignore, - ) - .ok() - .and_then(|test_string| { - RE_POSSIBLE_ENCODING_INDICATION - .captures_iter(&test_string) - .map(|c| c.extract()) - .find_map(|(_, [specified_encoding])| iana_name(specified_encoding)) - .map(|found_iana| found_iana.to_string()) + let test_string = &sequence[0..search_zone.min(sequence.len())]; + + RE_POSSIBLE_ENCODING_INDICATION + .captures_iter(test_string) + .map(|c| c.extract()) + .find_map(|(_, [specified_encoding])| { + std::str::from_utf8(specified_encoding) + .ok() + .and_then(Encoding::by_name) }) + .map(|found_iana| found_iana.to_string()) } // Calculate similarity of two single byte encodings +#[allow(dead_code)] pub(crate) fn cp_similarity(iana_name_a: &str, iana_name_b: &str) -> f32 { // we don't want to compare multi-byte encodings if is_multi_byte_encoding(iana_name_a) || is_multi_byte_encoding(iana_name_b) { @@ -153,13 +142,13 @@ pub(crate) fn cp_similarity(iana_name_a: &str, iana_name_b: &str) -> f32 { } if let (Some(encoder_a), Some(encoder_b)) = ( - encoding_from_whatwg_label(iana_name_a), - encoding_from_whatwg_label(iana_name_b), + Encoding::by_name(iana_name_a), + Encoding::by_name(iana_name_b), ) { let character_match_count = (1..255u8) .filter(|&ch| { - let res_a = encoder_a.decode(&[ch], DecoderTrap::Ignore).ok(); - let res_b = encoder_b.decode(&[ch], DecoderTrap::Ignore).ok(); + let res_a = encoder_a.decode(&[ch], WantDecode::Yes, IsChunk::No).ok(); + let res_b = encoder_b.decode(&[ch], WantDecode::Yes, IsChunk::No).ok(); res_a.is_some() && res_a == res_b //check that they aren't none and equal }) .count(); @@ -168,141 +157,12 @@ pub(crate) fn cp_similarity(iana_name_a: &str, iana_name_b: &str) -> f32 { 0.0 // Return 0.0 if encoders could not be retrieved. } -// Test Decoding bytes to string with specified encoding without writing result to memory -// returns true if everything is correctly decoded, otherwise false -struct DecodeTestResult { - only_test: bool, - data: String, -} -impl StringWriter for DecodeTestResult { - fn writer_hint(&mut self, expectedlen: usize) { - if self.only_test { - return; - } - let newlen = self.data.len() + expectedlen; - self.data.reserve(newlen); - } - fn write_char(&mut self, c: char) { - if self.only_test { - return; - } - self.data.push(c); - } - fn write_str(&mut self, s: &str) { - if self.only_test { - return; - } - self.data.push_str(s); - } -} -impl DecodeTestResult { - pub fn get_buffer(&self) -> &str { - &self.data - } -} - -// Decode bytes to string with specified encoding -// if is_chunk = true it will try to fix first and end bytes for multibyte encodings -pub fn decode( - input: &[u8], - from_encoding: &str, - how_process_errors: DecoderTrap, - only_test: bool, - is_chunk: bool, -) -> Result { - let encoder = encoding_from_whatwg_label(from_encoding) - .ok_or(format!("Encoding '{}' not found", from_encoding))?; - - let mut buf = DecodeTestResult { - only_test, - data: String::new(), - }; - let mut err = CodecError { - upto: 0, - cause: Cow::from(String::new()), - }; - let chunk_len = input.len(); - let mut begin_offset: usize = 0; - let mut end_offset: usize = chunk_len; - let mut error_occured: bool; - loop { - let res = decode_to( - encoder, - &input[begin_offset..end_offset], - how_process_errors, - &mut buf, - ); - error_occured = res.is_err(); - if let DecoderTrap::Strict = how_process_errors { - if !is_chunk || res.is_ok() || !is_multi_byte_encoding(from_encoding) { - break; - } - err = res.unwrap_err(); - if err.cause.contains("invalid sequence") { - begin_offset += 1; - } else if err.cause.contains("incomplete sequence") { - end_offset -= 1; - } - if end_offset - begin_offset < 1 || begin_offset > 3 || (chunk_len - end_offset) > 3 { - break; - } - } else { - break; - } - } - if error_occured { - return Err(format!("{} at index {}", err.cause, err.upto)); +/// Encode string to vec of bytes with specified encoding +pub fn encode(input: &str, to_encoding: &str, ignore_errors: bool) -> Result, String> { + match Encoding::by_name(to_encoding) { + Some(enc) => enc.encode(input, ignore_errors), + None => Err(format!("Encoding '{}' not found", to_encoding)), } - Ok(String::from(buf.get_buffer())) -} - -// Copied implementation of decode_to from encoder lib -// (we need index of problematic chars & hacks for chunks) -fn decode_to( - encoder: EncodingRef, - input: &[u8], - trap: DecoderTrap, - ret: &mut dyn StringWriter, -) -> Result<(), CodecError> { - let mut decoder = encoder.raw_decoder(); - let mut remaining = 0; - loop { - let (offset, err) = decoder.raw_feed(&input[remaining..], ret); - let unprocessed = remaining + offset; - - match err { - Some(err) => { - remaining = remaining.wrapping_add_signed(err.upto); - if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) { - return Err(err); - } - } - None => { - remaining = input.len(); - if let Some(err) = decoder.raw_finish(ret) { - remaining = remaining.wrapping_add_signed(err.upto); - if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) { - return Err(err); - } - } - if remaining >= input.len() { - return Ok(()); - } - } - } - } -} - -// Encode string to vec of bytes with specified encoding -pub fn encode( - input: &str, - to_encoding: &str, - how_process_errors: EncoderTrap, -) -> Result, String> { - if let Some(encoder) = encoding_from_whatwg_label(to_encoding) { - return Ok(encoder.encode(input, how_process_errors)?); - } - Err(format!("Encoding '{}' not found", to_encoding)) } // Determine if two Unicode range seen next to each other can be considered as suspicious. @@ -361,24 +221,15 @@ pub(crate) fn is_suspiciously_successive_range( true // if either range is none or edge cases never triggers, return true } -// Get data for specified language -pub(crate) fn get_language_data(language: &Language) -> Result<(&'static str, bool, bool), String> { - for (iterated_language, characters, has_accents, pure_latin) in LANGUAGES.iter() { - if iterated_language == language { - return Ok((characters, *has_accents, *pure_latin)); - } - } - Err(String::from("Language wasn't found")) -} - // ascii in encodings means windows-1252 codepage with supports diacritis // because of this we will check additionally it with is_ascii method pub(super) fn is_invalid_chunk( decoded_chunk_result: &Result, - encoding_iana: &str, + encoding_iana: &Encoding, ) -> bool { decoded_chunk_result.is_err() - || (encoding_iana == "ascii" && !decoded_chunk_result.as_ref().is_ok_and(|s| s.is_ascii())) + || (encoding_iana.name() == "ascii" + && !decoded_chunk_result.as_ref().is_ok_and(|s| s.is_ascii())) } // Get large datasets