/**
 * UTF-8 ↔ UTF-16 conversion helpers for the Win32 backend.
 *
 * D strings are UTF-8; the wide Win32 API (`...W`) speaks UTF-16. These helpers
 * bridge the two using the platform's own `MultiByteToWideChar` /
 * `WideCharToMultiByte`, which keeps the library free of any Phobos dependency
 * (smaller binaries) and matches the OS exactly. Conversions are lenient:
 * malformed input (for example a lone surrogate) is replaced with U+FFFD rather
 * than throwing, so a stray value from the OS can never crash the UI.
 */
module deft.util.strings;

import core.sys.windows.winnls : CP_UTF8, MultiByteToWideChar, WideCharToMultiByte;

/**
 * Clamp a D slice length to the `int` range used by the Win32 conversion APIs.
 *
 * A plain `cast(int)` would wrap for lengths above `int.max`; a wrapped value
 * of -1 in particular tells the API to scan for a NUL terminator, which a D
 * slice does not guarantee. Clamping keeps every call inside the slice.
 *
 * Params:
 *     length = element count of the source slice.
 * Returns: `length` if it fits in an `int`, otherwise `int.max`.
 */
private int win32Len(size_t length) @safe pure nothrow @nogc
{
    return length > int.max ? int.max : cast(int) length;
}

/**
 * Convert a D UTF-8 string into a null-terminated UTF-16 buffer suitable for
 * passing to wide Win32 APIs.
 *
 * For non-empty results the returned pointer refers to GC-managed memory; it
 * stays valid as long as the caller keeps a reference reachable (in practice,
 * for the duration of the Win32 call it is handed to). Empty input, or input
 * the OS refuses to convert, yields a pointer to a shared immutable
 * terminator instead.
 *
 * Params:
 *     s = UTF-8 text. Only the first `int.max` bytes are converted, because
 *         the Win32 API takes its length as an `int`.
 * Returns: pointer to the NUL-terminated UTF-16 text.
 */
const(wchar)* toWStringz(string s) @trusted
{
    // Shared, immutable terminator for the common empty-string case — every empty
    // control caption, tooltip, etc. would otherwise allocate a fresh wchar[1].
    static immutable wchar[1] emptyWz = ['\0'];
    if (s.length == 0)
        return emptyWz.ptr;

    const srcLen = win32Len(s.length);

    // First pass: ask how many UTF-16 code units the conversion produces.
    const needed = MultiByteToWideChar(CP_UTF8, 0, s.ptr, srcLen, null, 0);
    if (needed <= 0)
        return emptyWz.ptr;

    // Size in size_t so that `needed + 1` cannot overflow `int`.
    auto buf = new wchar[cast(size_t) needed + 1];

    // Second pass: convert for real. Terminate at the count actually written,
    // so that wchar.init (0xFFFF) filler can never precede the terminator if
    // the call writes less than announced or fails outright.
    const written = MultiByteToWideChar(CP_UTF8, 0, s.ptr, srcLen, buf.ptr, needed);
    const end = (written > 0 && written <= needed) ? written : 0;
    buf[end] = '\0';
    return buf.ptr;
}

/**
 * Convert a null-terminated UTF-16 buffer (as returned by Win32) into a D
 * UTF-8 string. Stops at the first NUL. A null pointer yields an empty string.
 *
 * Params:
 *     ws = pointer to NUL-terminated UTF-16 text, or `null`. The caller must
 *          guarantee the terminator exists; this is why the function is
 *          `@system`.
 * Returns: the text before the first NUL, as UTF-8.
 */
string fromWStringz(const(wchar)* ws) @system
{
    if (ws is null)
        return "";

    // Measure up to (not including) the terminator.
    size_t len = 0;
    while (ws[len] != '\0')
        ++len;

    return fromWString(ws[0 .. len]);
}

/**
 * Convert a known-length UTF-16 slice into a D UTF-8 string. Embedded NUL
 * characters are preserved; malformed code units are replaced with U+FFFD.
 *
 * Params:
 *     ws = UTF-16 code units. Only the first `int.max` units are converted,
 *          because the Win32 API takes its length as an `int`.
 * Returns: the UTF-8 text, or an empty string if `ws` is empty or the OS
 *          reports a conversion failure.
 */
string fromWString(const(wchar)[] ws) @trusted
{
    if (ws.length == 0)
        return "";

    const srcLen = win32Len(ws.length);

    // First pass: ask how many UTF-8 bytes the conversion produces.
    const needed = WideCharToMultiByte(
        CP_UTF8, 0, ws.ptr, srcLen, null, 0, null, null);
    if (needed <= 0)
        return "";

    auto buf = new char[needed];

    // Second pass: convert for real. Only the bytes actually written are
    // returned; untouched elements would still hold char.init (0xFF), which
    // is not valid UTF-8.
    const written = WideCharToMultiByte(
        CP_UTF8, 0, ws.ptr, srcLen, buf.ptr, needed, null, null);
    if (written <= 0 || written > needed)
        return "";

    // The buffer is freshly allocated and never escapes as mutable, so
    // treating it as immutable is sound.
    return cast(string) buf[0 .. written];
}

@safe unittest
{
    // Length clamping never wraps into a negative value.
    assert(win32Len(0) == 0);
    assert(win32Len(5) == 5);
    assert(win32Len(int.max) == int.max);
    assert(win32Len(size_t.max) == int.max);
}

// These roundtrip tests exercise fromWStringz, which is @system (it walks a
// raw pointer), so the blocks themselves cannot be @safe.
@system unittest
{
    // ASCII roundtrip.
    enum ascii = "Hello, world!";
    assert(ascii.toWStringz.fromWStringz == ascii);
}

@system unittest
{
    // Cyrillic roundtrip.
    enum cyrillic = "Привет, мир!";
    assert(cyrillic.toWStringz.fromWStringz == cyrillic);
}

@system unittest
{
    // Hebrew roundtrip (right-to-left script).
    enum hebrew = "שלום עולם";
    assert(hebrew.toWStringz.fromWStringz == hebrew);
}

@system unittest
{
    // CJK roundtrip (BMP) plus an astral-plane code point exercising surrogate
    // pairs (emoji).
    enum cjk = "日本語テスト 🎉";
    assert(cjk.toWStringz.fromWStringz == cjk);
}

@system unittest
{
    // Empty string roundtrips through both directions.
    assert("".toWStringz.fromWStringz == "");
    assert(fromWString([]) == "");
}

@system unittest
{
    // fromWStringz stops at the first embedded NUL.
    const(wchar)[] withNul = ['a', 'b', '\0', 'c', 'd'];
    assert(fromWStringz(withNul.ptr) == "ab");
}

@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.utf : validate;

    // A lone high surrogate is malformed UTF-16. Conversion must not throw, must
    // yield valid UTF-8, and must surface the replacement character (U+FFFD).
    const(wchar)[] lone = ['a', 0xD800, 'b'];
    string decoded = fromWString(lone);
    validate(decoded); // throws if not well-formed UTF-8
    assert(decoded.length >= 1);
    assert(decoded[0] == 'a');
    assert(decoded.canFind('\uFFFD'));
}