1 /**
2 	Utility functions for string processing
3 
4 	Copyright: © 2012-2014 RejectedSoftware e.K.
5 	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
6 	Authors: Sönke Ludwig
7 */
// NOTE(review): module name inferred from the sibling imports `vutil.array` and
// `vutil.memory`; the original declaration contained a doubled dot, which is not
// a valid module name. Confirm against the file's location in the package.
module vutil.string;
9 
10 public import std.string;
11 
12 import vutil.array;
13 import vutil.memory;
14 
15 import std.algorithm;
16 import std.array;
17 import std.ascii;
18 import std.format;
19 import std.uni;
20 import std.utf;
21 import core.exception;
22 
23 
/**
	Converts a byte sequence that may contain invalid UTF-8 into a valid UTF-8 string.

	Every position is first decoded as UTF-8. If decoding fails, the single byte
	at that position is taken as the code point instead (i.e. it is interpreted
	like a Latin-1 character) and the scan moves on by one byte.

	Params:
		str = raw input bytes
	Returns:
		A newly built string containing the re-encoded code points.
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;
	auto output = appender!string();
	output.reserve(str.length);

	for (size_t pos = 0; pos < str.length; ) {
		// fallback value in case the bytes at `pos` do not form a valid sequence
		dchar codepoint = str[pos];
		try {
			// on success, decode() moves `pos` past the decoded sequence
			codepoint = std.utf.decode(cast(const(char[]))str, pos);
		} catch (UTFException) {
			// skip just the offending byte and keep the fallback value
			pos++;
		}

		char[4] buf;
		immutable count = std.utf.encode(buf, codepoint);
		output.put(buf[0 .. count]);
	}

	return output.data;
}
47 
/**
	Removes a leading UTF-8 byte order mark (bytes EF BB BF) from a string.

	This is useful for text that was read from a file.

	Params:
		str = the string to inspect
	Returns:
		`str` without its first three bytes if they form the BOM, otherwise
		`str` unchanged.
*/
string stripUTF8Bom(string str)
@safe pure nothrow {
	if (str.length < 3)
		return str;

	immutable hasBom = str[0] == 0xEF && str[1] == 0xBB && str[2] == 0xBF;
	return hasBom ? str[3 .. $] : str;
}
58 
59 
/**
	Tests whether `str` consists solely of characters that occur in `chars`.

	`str` is iterated by code point. An empty `str` yields `true`.
*/
bool allOf(string str, string chars)
@safe pure {
	foreach (dchar codepoint; str) {
		immutable allowed = chars.canFind(codepoint);
		if (!allowed)
			return false;
	}
	return true;
}
70 
/**
	Variant of `std.string.indexOf` that can also be evaluated at compile time.

	During CTFE a plain decoding loop is used; at run time the call is
	forwarded to `std.string.indexOf`.

	Params:
		s = the string to search
		c = the character to look for
		cs = whether the comparison is case sensitive
	Returns:
		The index of the first match in `s`, or -1 if `c` does not occur.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
	if (__ctfe) {
		if (cs == CaseSensitive.yes) {
			foreach (i, dchar ch; s)
				if (ch == c)
					return i;
		} else {
			// lower-case the needle once instead of on every iteration
			c = std.uni.toLower(c);
			foreach (i, dchar ch; s)
				if (std.uni.toLower(ch) == c)
					return i;
		}
		return -1;
	} else return std.string.indexOf(s, c, cs);
}
87 
/**
	Checks if any character in 'str' is contained in 'chars'.

	`str` is iterated by code point (like `allOf`), so non-ASCII characters
	are compared as whole characters rather than as individual UTF-8 bytes.
	Invalid UTF-8 in `str` therefore surfaces as a decoding exception.

	Params:
		str = the string whose characters are tested
		chars = the set of characters to look for
	Returns:
		`true` if at least one character of `str` occurs in `chars`,
		`false` otherwise (including for an empty `str`).
*/
bool anyOf(string str, string chars)
@safe pure {
	// Iterating as `dchar` is essential: a bare `char` would be a single
	// UTF-8 code unit, which canFind() would misinterpret as a code point.
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}
98 
99 
/// Removes leading ASCII blanks (space and tab only) from `s`.
string stripLeftA(string s)
@safe pure nothrow {
	foreach (idx, char c; s)
		if (c != ' ' && c != '\t')
			return s[idx .. $];
	// nothing but blanks (or empty input): empty slice at the end of `s`
	return s[$ .. $];
}
107 
/// Removes trailing ASCII blanks (space and tab only) from `s`.
string stripRightA(string s)
@safe pure nothrow {
	size_t end = s.length;
	while (end > 0) {
		immutable c = s[end - 1];
		if (c != ' ' && c != '\t')
			break;
		end--;
	}
	return s[0 .. end];
}
115 
/// Removes ASCII blanks (space and tab only) from both ends of `s`.
string stripA(string s)
@safe pure nothrow {
	size_t lo = 0, hi = s.length;

	// trim the right end first, then the left end of what remains
	while (hi > 0 && (s[hi - 1] == ' ' || s[hi - 1] == '\t'))
		hi--;
	while (lo < hi && (s[lo] == ' ' || s[lo] == '\t'))
		lo++;

	return s[lo .. hi];
}
121 
/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is iterated by code point, so non-ASCII characters are matched as
	whole characters rather than as individual UTF-8 bytes. Invalid UTF-8 in
	`str` therefore surfaces as a decoding exception.

	Params:
		str = the string to search
		chars = the set of characters to look for
	Returns:
		The code unit (byte) index in `str` at which the first matching
		character starts, or -1 if no character of `chars` occurs in `str`.
*/
sizediff_t indexOfAny(string str, string chars)
@safe pure {
	// With a `dchar` loop variable, `i` is still the code unit offset of the
	// start of the current character, so the result can be used for slicing.
	foreach (i, dchar ch; str)
		if (chars.canFind(ch))
			return i;
	return -1;
}
/// Alternative name for `indexOfAny`.
alias countUntilAny = indexOfAny;
131 
/**
	Locates the bracket that closes the bracket at the start of a string.

	The opening bracket may be any of '[', '$(LPAREN)', '<' or '{'; only the
	matching bracket type is considered when scanning.

	Params:
		str = input string, expected to begin with an opening bracket
		nested = if `true`, further occurrences of the same opening bracket
			increase the nesting depth and need their own closing bracket;
			if `false`, the first closing bracket ends the search
	Returns:
		The index of the closing bracket, or -1 if the string is shorter than
		two characters, does not start with a supported bracket, or is
		unbalanced.
*/
sizediff_t matchBracket(string str, bool nested = true)
@safe pure nothrow {
	if (str.length < 2) return -1;

	immutable char opening = str[0];
	char closing;
	if (opening == '[') closing = ']';
	else if (opening == '(') closing = ')';
	else if (opening == '<') closing = '>';
	else if (opening == '{') closing = '}';
	else return -1;

	// the first character already opened one level
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; pos++) {
		immutable char c = str[pos];
		if (nested && c == opening) {
			depth++;
		} else if (c == closing) {
			depth--;
			if (depth == 0) return pos;
		}
	}
	return -1;
}
163 
@safe unittest
{
    // plain, balanced pairs
    assert(matchBracket("[foo]") == 4);
    assert(matchBracket("<bar>") == 4);
    assert(matchBracket("{baz}") == 4);

    // too short, unbalanced, or not starting with a bracket
    assert(matchBracket("[") == -1);
    assert(matchBracket("[foo") == -1);
    assert(matchBracket("ab[f]") == -1);

    // nesting only counts brackets of the same type as the opening one
    assert(matchBracket("[foo[bar]]") == 9);
    assert(matchBracket("[foo{bar]]") == 8);

    // with nesting disabled the first closing bracket wins
    assert(matchBracket("[foo[bar]]", false) == 8);

    // must be usable at compile time
    static assert(matchBracket("[foo]") == 4);
}
177 
/**
	Formats `args` according to `fmt`, like `std.string.format`, but collects
	the output in an `AllocAppender!string` constructed from `alloc`.

	Params:
		alloc = allocator handed to the appender
		fmt = format string
		args = values to format
	Returns:
		The data accumulated by the appender.
*/
string formatAlloc(ARGS...)(Allocator alloc, string fmt, ARGS args)
{
	auto sink = AllocAppender!string(alloc);
	// pass the appender by address so the formatted output lands in `sink` itself
	formattedWrite(&sink, fmt, args);
	return sink.data;
}
185 
/**
	Special version of icmp() with optimization for ASCII characters.

	The common prefix of both strings is skipped byte-wise. The remainder is
	compared case-insensitively: ASCII characters are lower-cased inline, all
	other characters are decoded and lower-cased using `std.uni.toLower`.

	Params:
		a = left-hand string
		b = right-hand string
	Returns:
		-1 if `a` sorts before `b`, 1 if `a` sorts after `b`, and 0 if both
		compare equal.
	Throws:
		`UTFException` (from `std.utf.decode`) if a non-ASCII part that has to
		be decoded is not valid UTF-8.
*/
int icmp2(string a, string b)
@safe pure {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// The byte-wise scan may have stopped inside (or directly after) a
	// multi-byte UTF-8 sequence. Rewind to the lead byte of that sequence so
	// that decode() below always starts on a code point boundary. Stepping
	// back just a single byte is not enough: it can land on a continuation
	// byte, e.g. for two identical strings ending in a non-ASCII character.
	if( i > 0 && (a[i-1] & 0x80) ){
		i--;
		while( i > 0 && (a[i] & 0xC0) == 0x80 ) i--;
	}
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: cheap inline case folding
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: decode full code points
			dchar acp = decode(a, i);
			dchar bcp = decode(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case-insensitive) prefix of the other: the longer one sorts last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}