mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 00:37:04 +00:00
Fix deepseek deseret regex (#7369)
On Windows, when compiled with GCC, the C++ regex library failed to handle the Deseret characters.
This commit is contained in:
21
llama/unicode.cpp
vendored
21
llama/unicode.cpp
vendored
@@ -28,6 +28,11 @@
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "unicode-data.h"
|
||||
|
||||
@@ -227,8 +232,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
||||
}
|
||||
|
||||
// Converts the UTF-8 encoded string `s` into a std::wstring.
//
// On Windows the conversion goes through MultiByteToWideChar(CP_UTF8, ...);
// on every other platform it uses std::wstring_convert with std::codecvt_utf8<wchar_t>.
//
// Throws std::invalid_argument on Windows when MultiByteToWideChar reports failure
// (returns 0). On other platforms any exception raised by from_bytes() propagates.
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#ifdef _WIN32
    // cbMultiByte == -1 makes the API treat the input as NUL-terminated, so the
    // reported length includes the terminating NUL character.
    const int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, nullptr, 0);
    if (!wlen) {
        throw std::invalid_argument("failed to convert regex");
    }

    // Convert straight into the result string: the std::wstring owns the storage,
    // so nothing leaks if an exception is thrown along the way (the previous
    // malloc/free pair leaked the buffer if constructing the result threw, and
    // never checked the malloc result).
    std::wstring ret(static_cast<size_t>(wlen), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &ret[0], wlen);
    if (!written) {
        throw std::invalid_argument("failed to convert regex");
    }

    // Drop the terminating NUL that the API wrote as the last character.
    ret.resize(static_cast<size_t>(written) - 1);
    return ret;
#else
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
#endif
}
|
||||
|
||||
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
||||
|
||||
Reference in New Issue
Block a user