deft.util.strings source code

1 /**
2  * UTF-8 ↔ UTF-16 conversion helpers for the Win32 backend.
3  *
4  * D strings are UTF-8; the wide Win32 API (`...W`) speaks UTF-16. These helpers
5  * bridge the two using the platform's own `MultiByteToWideChar` /
6  * `WideCharToMultiByte`, which keeps the library free of any Phobos dependency
7  * (smaller binaries) and matches the OS exactly. Conversions are lenient:
8  * malformed input (for example a lone surrogate) is replaced with U+FFFD rather
9  * than throwing, so a stray value from the OS can never crash the UI.
10  */
11 module deft.util.strings;
12 
13 import core.sys.windows.winnls : CP_UTF8, MultiByteToWideChar, WideCharToMultiByte;
14 
/**
 * Convert a D UTF-8 string into a null-terminated UTF-16 buffer suitable for
 * passing to wide Win32 APIs.
 *
 * An empty input yields a pointer to a shared, immutable terminator. Any other
 * input yields a pointer into a freshly allocated GC array; it stays valid as
 * long as the caller keeps a reference reachable (in practice, for the
 * duration of the Win32 call it is handed to).
 *
 * `MultiByteToWideChar` takes its source length as an `int`, so inputs longer
 * than `int.max` bytes are converted only up to that limit. If the conversion
 * itself fails, an empty (terminator-only) string is returned instead of an
 * uninitialised buffer.
 *
 * Params:
 *   s = UTF-8 text to convert; may be empty.
 *
 * Returns: pointer to NUL-terminated UTF-16 text; never `null`.
 */
const(wchar)* toWStringz(string s) @trusted
{
	// Shared, immutable terminator for the common empty-string case — every empty
	// control caption, tooltip, etc. would otherwise allocate a fresh wchar[1].
	static immutable wchar[1] emptyWz = ['\0'];
	if (s.length == 0)
		return emptyWz.ptr;

	// Clamp rather than blindly cast: a truncating cast could produce -1, which
	// the API interprets as "scan until NUL" and would read past the slice, or
	// some other wrong length.
	const int srcLen = s.length > int.max ? int.max : cast(int) s.length;

	// First pass: ask how many UTF-16 code units the text needs.
	const int needed = MultiByteToWideChar(CP_UTF8, 0, s.ptr, srcLen, null, 0);
	if (needed <= 0)
		return emptyWz.ptr;

	// Size arithmetic in size_t so `needed + 1` cannot overflow `int`.
	auto buf = new wchar[cast(size_t) needed + 1];

	// Second pass: convert for real. Terminate after what was actually written,
	// because `new wchar[]` fills the array with wchar.init (0xFFFF), not zeros.
	const int written = MultiByteToWideChar(CP_UTF8, 0, s.ptr, srcLen, buf.ptr, needed);
	buf[written > 0 ? written : 0] = '\0';
	return buf.ptr;
}
38 
/**
 * Decode a NUL-terminated UTF-16 buffer (the shape Win32 hands back) into a D
 * UTF-8 string.
 *
 * Only the code units before the first NUL are decoded. Passing `null` is
 * allowed and produces an empty string.
 *
 * Params:
 *   ws = pointer to NUL-terminated UTF-16 text, or `null`.
 *
 * Returns: the text as UTF-8, produced by `fromWString`.
 */
string fromWStringz(const(wchar)* ws) @system
{
	if (ws is null)
		return "";

	// Advance a cursor to the terminator; the distance travelled is the length.
	const(wchar)* cursor = ws;
	while (*cursor != '\0')
		++cursor;

	const size_t unitCount = cast(size_t)(cursor - ws);
	return fromWString(ws[0 .. unitCount]);
}
54 
/**
 * Convert a known-length UTF-16 slice into a D UTF-8 string. Embedded NUL
 * characters are preserved; malformed code units are replaced with U+FFFD.
 *
 * `WideCharToMultiByte` takes its source length as an `int`, so slices longer
 * than `int.max` code units are converted only up to that limit. If either
 * conversion pass fails, an empty string is returned.
 *
 * Params:
 *   ws = UTF-16 code units to convert; may be empty.
 *
 * Returns: the converted text in a freshly allocated GC string, or `""`.
 */
string fromWString(const(wchar)[] ws) @trusted
{
	if (ws.length == 0)
		return "";

	// Clamp rather than blindly cast: a truncating cast could produce -1, which
	// the API interprets as "scan until NUL" and would read past the slice, or
	// some other wrong length.
	const int srcLen = ws.length > int.max ? int.max : cast(int) ws.length;

	// First pass: ask how many UTF-8 bytes the text needs.
	const int needed = WideCharToMultiByte(
		CP_UTF8, 0, ws.ptr, srcLen, null, 0, null, null);
	if (needed <= 0)
		return "";

	auto buf = new char[needed];

	// Second pass: convert for real. Keep only the bytes actually written,
	// because `new char[]` fills the array with char.init (0xFF), which is not
	// valid UTF-8 and must never leak into the returned string.
	const int written = WideCharToMultiByte(
		CP_UTF8, 0, ws.ptr, srcLen, buf.ptr, needed, null, null);
	if (written <= 0)
		return "";

	// `buf` is unique to this call, so viewing it as immutable is sound.
	return cast(string) buf[0 .. written];
}
74 
// The roundtrip tests below go through fromWStringz, which is @system because
// it walks a raw pointer; the test blocks therefore cannot be marked @safe.
@system unittest
{
	// Plain ASCII must come back unchanged after UTF-8 → UTF-16 → UTF-8.
	immutable string original = "Hello, world!";
	const(wchar)* wide = toWStringz(original);
	assert(fromWStringz(wide) == original);
}
83 
@system unittest
{
	// Cyrillic text (two-byte UTF-8 sequences) must come back unchanged.
	immutable string original = "Привет, мир!";
	const(wchar)* wide = toWStringz(original);
	assert(fromWStringz(wide) == original);
}
90 
@system unittest
{
	// Hebrew text (a right-to-left script) must come back unchanged.
	immutable string original = "שלום עולם";
	const(wchar)* wide = toWStringz(original);
	assert(fromWStringz(wide) == original);
}
97 
@system unittest
{
	// CJK characters from the BMP, followed by an emoji from an astral plane
	// that needs a surrogate pair in UTF-16, must come back unchanged.
	immutable string original = "日本語テスト 🎉";
	const(wchar)* wide = toWStringz(original);
	assert(fromWStringz(wide) == original);
}
105 
@system unittest
{
	// The empty string survives both paths: the pointer-based roundtrip and the
	// slice-based decoder.
	const(wchar)* wideEmpty = toWStringz("");
	assert(fromWStringz(wideEmpty) == "");

	const(wchar)[] noUnits = [];
	assert(fromWString(noUnits) == "");
}
112 
@system unittest
{
	// Decoding through the pointer-based entry point ends at the first NUL, so
	// the code units after it are never seen.
	static immutable wchar[5] units = ['a', 'b', '\0', 'c', 'd'];
	string decoded = fromWStringz(units.ptr);
	assert(decoded == "ab");
}
119 
@safe unittest
{
	import std.algorithm.searching : canFind;
	import std.utf : validate;

	// Malformed UTF-16: a high surrogate with no low surrogate after it. The
	// conversion must not throw, its output must be well-formed UTF-8, and the
	// replacement character (U+FFFD) must appear in the result.
	const(wchar)[] malformed = ['a', 0xD800, 'b'];
	string utf8 = fromWString(malformed);

	validate(utf8); // throws on ill-formed UTF-8

	assert(utf8.length >= 1);
	assert(utf8[0] == 'a');
	assert(utf8.canFind('�'));
}