Fix deepseek deseret regex (#7369)

On Windows, when compiled with GCC, the C++ regex library failed to handle the Deseret characters
2025-12-12 00:37:04 +00:00 · 2024-10-26 14:58:54 -07:00
parent d7c94e0ca6
commit 099f7077a1
3 changed files with 88 additions and 1 deletions
--- a/llama/unicode.cpp
+++ b/llama/unicode.cpp
@@ -28,6 +28,11 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
 #include "unicode.h"
 #include "unicode-data.h"

@@ -227,8 +232,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }

 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
+    if (!wlen) {
+        throw std::invalid_argument("failed to convert regex");
+    }
+    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
+    if (!wlen) {
+        free(wbuf);
+        throw std::invalid_argument("failed to convert regex");
+    }
+    std::wstring ret = std::wstring(wbuf);
+    free(wbuf);
+    return ret;
+#else
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
+#endif
 }

 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {