/**
	Utility functions for string processing

	Copyright: © 2012-2014 RejectedSoftware e.K.
	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
	Authors: Sönke Ludwig
*/
module vutil.string;

public import std.string;

import vutil.array;
import vutil.memory;

import std.algorithm;
import std.array;
import std.ascii;
import std.format;
import std.uni;
import std.utf;
import core.exception;


/**
	Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
	the original as possible.

	Valid sequences are decoded and re-encoded unchanged. A byte at which decoding
	fails is emitted as the code point with the same numeric value as that byte,
	after which processing continues with the following byte.

	Params:
		str = raw bytes that are supposed to contain UTF-8 text
	Returns:
		A newly built string containing only valid UTF-8.
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;
	auto ret = appender!string();
	ret.reserve(str.length);

	size_t i = 0;
	while (i < str.length) {
		// Fallback value in case the bytes at `i` cannot be decoded.
		dchar ch = str[i];
		// On success `decode` advances `i` past the whole sequence; on failure
		// only the offending byte is skipped so that scanning can resume.
		try ch = std.utf.decode(cast(const(char[]))str, i);
		catch( UTFException ){ i++; }
		char[4] dst;
		auto len = std.utf.encode(dst, ch);
		ret.put(dst[0 .. len]);
	}

	return ret.data;
}

/**
	Strips the byte order mark of an UTF8 encoded string.
	This is useful when the string is coming from a file.

	Params:
		str = string that may start with the bytes EF BB BF
	Returns:
		A slice of `str` without the leading byte order mark, or `str` itself
		if it does not start with one.
*/
string stripUTF8Bom(string str)
@safe pure nothrow {
	// "\xEF\xBB\xBF" is the UTF-8 encoding of U+FEFF.
	if (str.length >= 3 && str[0 .. 3] == "\xEF\xBB\xBF")
		return str[3 ..$];
	return str;
}


/**
	Checks if all characters in 'str' are contained in 'chars'.
*/
bool allOf(string str, string chars)
@safe pure {
	foreach (dchar ch; str)
		if (!chars.canFind(ch))
			return false;
	return true;
}

/**
	Variant of `std.string.indexOf` for a single character that can also be
	evaluated at compile time.

	During CTFE the search is performed with a plain loop; at run time the
	call is forwarded to `std.string.indexOf`.

	Params:
		s = string to search in
		c = character to search for
		cs = whether the comparison is case sensitive
	Returns:
		The index of the first match within `s`, or -1 if `c` was not found.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
	if (__ctfe) {
		if (cs == CaseSensitive.yes) {
			foreach (i, dchar ch; s)
				if (ch == c)
					return i;
		} else {
			c = std.uni.toLower(c);
			foreach (i, dchar ch; s)
				if (std.uni.toLower(ch) == c)
					return i;
		}
		return -1;
	} else return std.string.indexOf(s, c, cs);
}

/**
	Checks if any character in 'str' is contained in 'chars'.
*/
bool anyOf(string str, string chars)
@safe pure {
	// Decode to code points (like allOf does): comparing raw UTF-8 code units
	// against the decoded characters of `chars` mismatches for non-ASCII text.
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}


/// ASCII whitespace trimming (space and tab)
string stripLeftA(string s)
@safe pure nothrow {
	while (s.length > 0 && (s[0] == ' ' || s[0] == '\t'))
		s = s[1 .. $];
	return s;
}

/// ASCII whitespace trimming (space and tab)
string stripRightA(string s)
@safe pure nothrow {
	while (s.length > 0 && (s[$-1] == ' ' || s[$-1] == '\t'))
		s = s[0 .. $-1];
	return s;
}

/// ASCII whitespace trimming (space and tab)
string stripA(string s)
@safe pure nothrow {
	return stripLeftA(stripRightA(s));
}

/**
	Finds the first occurrence of any of the characters in `chars`.

	Params:
		str = string to search in
		chars = set of characters to look for
	Returns:
		The index in `str` (in code units) at which the first matching
		character starts, or -1 if none of the characters occurs.
*/
sizediff_t indexOfAny(string str, string chars)
@safe pure {
	// Iterate by code point so that multi-byte characters are compared as a
	// whole; `i` is still the code unit index at which the character starts.
	foreach (i, dchar ch; str)
		if (chars.canFind(ch))
			return i;
	return -1;
}
alias countUntilAny = indexOfAny;

/**
	Finds the closing bracket (works with any of '[', '$(LPAREN)', '<', '{').

	Params:
		str = input string
		nested = whether to skip nested brackets
	Returns:
		The index of the closing bracket or -1 for unbalanced strings
		and strings that don't start with a bracket.
*/
sizediff_t matchBracket(string str, bool nested = true)
@safe pure nothrow {
	// A lone opening bracket (or an empty string) can never be balanced.
	if (str.length < 2) return -1;

	immutable char opener = str[0];
	char closer;
	switch (opener) {
		case '[': closer = ']'; break;
		case '(': closer = ')'; break;
		case '<': closer = '>'; break;
		case '{': closer = '}'; break;
		default: return -1;
	}

	// Number of brackets that are currently open; the first one is counted.
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; pos++) {
		immutable c = str[pos];
		if (nested && c == opener) {
			depth++;
			continue;
		}
		if (c == closer) {
			depth--;
			if (depth == 0) return pos;
		}
	}
	return -1;
}

@safe unittest
{
	static struct Case { string input; sizediff_t expected; }
	enum cases = [
		Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
		Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
		Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
	];
	foreach (c; cases)
		assert(matchBracket(c.input) == c.expected);
	assert(matchBracket("[foo[bar]]", false) == 8);
	static assert(matchBracket("[foo]") == 4);
}

/// Same as std.string.format, just using an allocator.
///
/// Params:
///     alloc = allocator passed to the `AllocAppender` that collects the output
///     fmt = format string forwarded to `formattedWrite`
///     args = values to be formatted according to `fmt`
/// Returns: The data accumulated by the appender.
string formatAlloc(ARGS...)(Allocator alloc, string fmt, ARGS args)
{
	auto app = AllocAppender!string(alloc);
	formattedWrite(&app, fmt, args);
	return app.data;
}

/**
	Special version of icmp() with optimization for ASCII characters

	The common prefix of both strings is skipped byte-wise. The remainder is
	compared case-insensitively: ASCII characters are lower-cased directly,
	everything else is decoded and lower-cased using `std.uni.toLower`.

	Params:
		a = first string
		b = second string
	Returns:
		-1 if `a` sorts before `b`, 1 if it sorts after `b` and 0 if both
		strings are equal when ignoring case.
	Throws:
		`UTFException` if a non-ASCII part that has to be decoded is not
		valid UTF-8.
*/
int icmp2(string a, string b)
@safe pure {
	// True for the second and later code units of a multi-byte UTF-8 sequence.
	static bool isContinuation(char c) @safe pure nothrow @nogc {
		return (c & 0xC0) == 0x80;
	}

	size_t i = 0, j = 0;

	// fast skip equal prefix
	immutable min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// The byte-wise scan may have stopped in the middle of a multi-byte
	// sequence. Decoding has to start at the lead byte, so walk back until
	// the position is a sequence boundary in both strings. If the scan
	// stopped right behind a complete sequence, nothing must be undone.
	while( i > 0 && ((i < a.length && isContinuation(a[i]))
			|| (i < b.length && isContinuation(b[i]))) )
		i--;
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII, no decoding necessary
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// decode advances i and j past the respective sequences
			dchar acp = std.utf.decode(a, i);
			dchar bcp = std.utf.decode(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case-insensitive) prefix of the other: the longer one sorts last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}