1 module served.lsp.textdocumentmanager;
2 
import std.algorithm;
import std.experimental.logger;
import std.json;
import std.string;
import std.utf : codeLength, decode, UseReplacementDchar;

import served.lsp.jsonrpc;
import served.lsp.protocol;

import painlessjson;
13 
/// In-memory representation of a file at any given URI. Not thread-safe.
///
/// The content is kept in a private `char[]` buffer that is modified in place
/// by $(LREF setContent) and $(LREF applyChange). All `*Bytes` values index
/// into that buffer, "offset" values count UTF-16 code units and `Position`
/// values are 0-based line / UTF-16 column pairs.
struct Document
{
	/// The URI of this document. Should not be changed.
	DocumentUri uri;
	/// The language ID as reported by the client. Should not be changed.
	string languageId;
	/// The document version as reported by the client. Should not be changed.
	long version_;
	private char[] text;

	/// Returns the stored language ID or, when none is stored, a language ID
	/// derived from the (case-insensitively compared) extension of the URI.
	/// Returns: `null` if no language ID is stored and the extension is not
	/// one of `.d`, `.dpp`, `.ds`, `.dscript`, `.dml`, `.sdl` or `.dt`.
	string getLanguageId() const @property @trusted @nogc nothrow
	{
		if (!languageId.length)
		{
			import std.path : extension;
			import std.uni : sicmp;

			const ext = uri.extension;
			if (ext.sicmp(".d") == 0)
				return "d";
			else if (ext.sicmp(".dpp") == 0)
				return "dpp";
			else if (ext.sicmp(".ds") == 0 || ext.sicmp(".dscript") == 0)
				return "dscript";
			else if (ext.sicmp(".dml") == 0)
				return "dml";
			else if (ext.sicmp(".sdl") == 0)
				return "sdl";
			else if (ext.sicmp(".dt") == 0)
				return "diet";
			else
				return null;
		}

		return languageId;
	}

	/// Creates a new D document at the given document URI, with version 0 and
	/// no text.
	this(DocumentUri uri)
	{
		this.uri = uri;
		languageId = "d";
		version_ = 0;
		text = null;
	}

	/// Creates a new document at the given document URI, with the given version
	/// and language and creates a copy of the text to use.
	this(TextDocumentItem doc)
	{
		uri = doc.uri;
		languageId = doc.languageId;
		version_ = doc.version_;
		text = doc.text.dup;
	}

	/// Creates a document with no URI and no language ID and copies the content
	/// into the text buffer using $(LREF setContent).
	static Document nullDocument(scope const(char)[] content)
	{
		Document ret;
		ret.setContent(content);
		return ret;
	}

	/// Returns an immutable copy of this document which owns a duplicate of
	/// the text buffer, so later edits to this document do not affect it.
	immutable(Document) clone()
	{
		Document ret = this;
		ret.text = text.dup;
		return cast(immutable) ret;
	}

	// Test helper: wraps the given buffer directly, without copying it.
	version (unittest) private static Document nullDocumentOwnMemory(char[] content)
	{
		Document ret;
		ret.text = content;
		return ret;
	}

	/// Returns a read-only view of the text. The text may however be changed
	/// by other operations, so this slice should be used directly and not after
	/// any context yield or API call potentially modifying the data.
	const(char)[] rawText() const
	{
		return cast(const(char)[]) text;
	}

	/// Returns the text of an immutable document as a `string`.
	string rawText() immutable
	{
		return text;
	}

	/// Returns the length of the text in bytes.
	size_t length() const @property
	{
		return text.length;
	}

	/// Sets the content of this document to the given content. Copies the data
	/// from newContent into this text buffer.
	///
	/// Should not be called as an API unless managing some kind of virtual
	/// document manually.
	void setContent(scope const(char)[] newContent)
	{
		if (newContent.length <= text.length)
		{
			// fits into the existing buffer: overwrite, then shrink
			text[0 .. newContent.length] = newContent;
			text.length = newContent.length;
		}
		else
		{
			// allow growing in place where the underlying block has capacity
			text = text.assumeSafeAppend;
			text.length = newContent.length;
			text = text.assumeSafeAppend;
			text[0 .. $] = newContent;
		}
	}

	/// Replaces the text between the two positions of `range` with
	/// `newContent`, modifying the text buffer in place.
	///
	/// Both positions are converted using $(LREF positionToBytes); if the
	/// start lies behind the end they are swapped. A range spanning the whole
	/// text is forwarded to $(LREF setContent).
	///
	/// `newContent` must not be a slice of this document's own text, as the
	/// buffer is shifted before the new content is copied in.
	void applyChange(TextRange range, scope const(char)[] newContent)
	{
		import core.stdc.string : memmove;

		auto start = positionToBytes(range[0]);
		auto end = positionToBytes(range[1]);

		if (start > end)
			swap(start, end);

		if (start == 0 && end == text.length)
		{
			setContent(newContent);
			return;
		}

		// everything is computed in size_t: no truncation for large buffers
		// and no mixed signed/unsigned comparisons
		const oldLength = text.length;
		const tailLength = oldLength - end;
		const newEnd = start + newContent.length;
		const newLength = newEnd + tailLength;

		text = text.assumeSafeAppend;
		if (newLength > oldLength)
		{
			// grow first (this may move the buffer), then shift the tail back
			text.length = newLength;
			memmove(text.ptr + newEnd, text.ptr + end, tailLength);
		}
		else if (newLength < oldLength)
		{
			// shift the tail forward, then cut off the now unused bytes
			memmove(text.ptr + newEnd, text.ptr + end, tailLength);
			text = text[0 .. newLength];
		}
		text = text.assumeSafeAppend;

		foreach (i, c; newContent)
			text[start + i] = c;
	}

	/// Converts an LSP offset to a byte offset for using for example in array
	/// slicing.
	size_t offsetToBytes(size_t offset) const
	{
		return .countBytesUntilUTF16Index(text, offset);
	}

	/// Converts a byte offset to an LSP offset. Byte offsets past the end of
	/// the text are treated like the end of the text.
	size_t bytesToOffset(size_t bytes) const
	{
		return .countUTF16Length(text[0 .. min($, bytes)]);
	}

	/// Converts a line/column position to an LSP offset. Columns past the end
	/// of a line stop at its line break, lines past the end of the text yield
	/// the offset of the end of the text.
	size_t positionToOffset(Position position) const
	{
		size_t offset = 0;
		size_t bytes = 0;
		// skip whole lines, counting their UTF-16 length on the way
		while (bytes < text.length && position.line > 0)
		{
			const c = text.ptr[bytes];
			if (c == '\n')
				position.line--;
			utf16DecodeUtf8Length(c, offset, bytes);
		}

		// advance inside the line until the column is used up
		while (bytes < text.length && position.character > 0)
		{
			const c = text.ptr[bytes];
			if (c == '\n')
				break;
			size_t utf16Size;
			utf16DecodeUtf8Length(c, utf16Size, bytes);
			if (utf16Size < position.character)
				position.character -= utf16Size;
			else
				position.character = 0;
			offset += utf16Size;
		}
		return offset;
	}

	/// Converts a line/column position to a byte offset. Columns past the end
	/// of a line stop at its line break, lines past the end of the text yield
	/// the length of the text. The result never exceeds the text length.
	size_t positionToBytes(Position position) const
	{
		size_t index = 0;
		while (index < text.length && position.line > 0)
			if (text.ptr[index++] == '\n')
				position.line--;

		while (index < text.length && position.character > 0)
		{
			const c = text.ptr[index];
			if (c == '\n')
				break;
			size_t utf16Size;
			utf16DecodeUtf8Length(c, utf16Size, index);
			if (utf16Size < position.character)
				position.character -= utf16Size;
			else
				position.character = 0;
		}
		// the decoder advances by the length announced in the lead byte, so a
		// truncated sequence at the end of the text would step past the buffer
		return min(index, text.length);
	}

	/// Converts an LSP offset to a line/column position.
	Position offsetToPosition(size_t offset) const
	{
		size_t bytes;
		size_t index;
		// -1 wraps around so that `lastNl + 1` is 0 on the first line
		size_t lastNl = -1;

		Position ret;
		while (bytes < text.length && index < offset)
		{
			const c = text.ptr[bytes];
			if (c == '\n')
			{
				ret.line++;
				lastNl = index;
			}
			utf16DecodeUtf8Length(c, index, bytes);
		}
		const start = lastNl + 1;
		ret.character = cast(uint)(index - start);
		return ret;
	}

	/// Converts a byte offset to a line/column position. Byte offsets past
	/// the end of the text are treated like the end of the text.
	Position bytesToPosition(size_t bytes) const
	{
		if (bytes > text.length)
			bytes = text.length;
		auto part = text.ptr[0 .. bytes].representation;
		// -1 wraps around so that `lastNl + 1` is 0 on the first line
		size_t lastNl = -1;
		Position ret;
		foreach (i; 0 .. bytes)
		{
			if (part.ptr[i] == '\n')
			{
				ret.line++;
				lastNl = i;
			}
		}
		ret.character = cast(uint)(cast(const(char)[]) part[lastNl + 1 .. $]).countUTF16Length;
		return ret;
	}

	/// Converts a line/column byte offset to a line/column position.
	///
	/// A column beyond the length of the line keeps its overshoot: the excess
	/// is added unchanged to the UTF-16 length of the line.
	Position lineColumnBytesToPosition(uint line, uint column) const
	{
		scope lineText = lineAtScope(line);
		uint offset = 0;
		// keep over-extending positions
		if (column > lineText.length)
		{
			offset = column - cast(uint)lineText.length;
			column -= offset;
			assert(column <= lineText.length);
		}
		return Position(line, cast(uint) lineText[0 .. column].countUTF16Length + offset);
	}

	/// Returns the position at "end" starting from the given "src" position which is assumed to be at byte "start"
	/// Faster to quickly calculate nearby positions of known byte positions.
	/// Falls back to $(LREF bytesToPosition) if end is before start.
	Position movePositionBytes(Position src, size_t start, size_t end) const
	{
		if (end == start)
			return src;
		if (end < start)
			return bytesToPosition(end);

		auto t = text[min($, start) .. min($, end)];
		size_t bytes;
		while (bytes < t.length)
		{
			const c = t.ptr[bytes];
			if (c == '\n')
			{
				src.line++;
				src.character = 0;
				bytes++;
			}
			else
				utf16DecodeUtf8Length(c, src.character, bytes);
		}
		return src;
	}

	/// Same as $(LREF movePositionBytes), but additionally stores the
	/// resulting position in `src` and `end` in `start`, so the call can be
	/// repeated with increasing `end` values.
	Position nextPositionBytes(ref Position src, ref size_t start, size_t end) const
	{
		auto pos = movePositionBytes(src, start, end);
		src = pos;
		start = end;
		return pos;
	}

	/// Returns the word range at a given line/column position.
	TextRange wordRangeAt(Position position) const
	{
		auto chars = wordInLine(lineAtScope(position), position.character);
		return TextRange(Position(position.line, chars[0]), Position(position.line, chars[1]));
	}

	/// Returns the word range at a given byte position.
	size_t[2] wordRangeAt(size_t bytes) const
	{
		auto lineStart = text.lastIndexOf('\n', bytes) + 1;
		// the column is passed on as size_t, no narrowing to uint needed
		auto ret = wordInLineBytes(text[lineStart .. $], bytes - lineStart);
		ret[0] += lineStart;
		ret[1] += lineStart;
		return ret;
	}

	/// Returns a byte offset range as `[start, end]` of the given 0-based line
	/// number. The range includes the terminating `\n` if the line has one.
	/// Returns `[0, 0]` if the line does not exist.
	size_t[2] lineByteRangeAt(uint line) const
	{
		size_t start = 0;
		size_t index = 0;
		while (line > 0 && index < text.length)
		{
			const c = text.ptr[index++];
			if (c == '\n')
			{
				line--;
				start = index;
			}
		}
		// if !found
		if (line != 0)
			return [0, 0];

		auto end = text.indexOf('\n', start);
		if (end == -1)
			end = text.length;
		else
			end++;

		return [start, end];
	}

	/// Returns the text of a line at the given position.
	string lineAt(Position position) const
	{
		return lineAt(position.line);
	}

	/// Returns the text of a line starting at line 0.
	string lineAt(uint line) const
	{
		return lineAtScope(line).idup;
	}

	/// Returns the line text which is only in this scope if text isn't modified
	/// See_Also: $(LREF lineAt)
	scope const(char)[] lineAtScope(Position position) const
	{
		return lineAtScope(position.line);
	}

	/// Returns the line text which is only in this scope if text isn't modified
	/// See_Also: $(LREF lineAt)
	scope const(char)[] lineAtScope(uint line) const
	{
		auto range = lineByteRangeAt(line);
		return text[range[0] .. range[1]];
	}

	unittest
	{
		void assertEqual(A, B)(A a, B b)
		{
			import std.conv : to;

			assert(a == b, a.to!string ~ " is not equal to " ~ b.to!string);
		}

		Document doc;
		doc.setContent(`abc
hellö world
how åre
you?`);
		assertEqual(doc.lineAt(Position(0, 0)), "abc\n");
		assertEqual(doc.lineAt(Position(0, 100)), "abc\n");
		assertEqual(doc.lineAt(Position(1, 3)), "hellö world\n");
		assertEqual(doc.lineAt(Position(2, 0)), "how åre\n");
		assertEqual(doc.lineAt(Position(3, 0)), "you?");
		assertEqual(doc.lineAt(Position(3, 8)), "you?");
		assertEqual(doc.lineAt(Position(4, 0)), "");
	}

	/// Returns how a line is terminated at the given 0-based line number.
	///
	/// Returns `EolType.crlf` if the `\n` ending the line directly follows a
	/// `\r`, otherwise `EolType.lf`. `EolType.lf` is also returned for lines
	/// which do not exist or are not terminated.
	EolType eolAt(int line) const
	{
		size_t index = 0;
		int curLine = 0;
		bool prevWasCr = false;
		while (index < text.length)
		{
			if (curLine > line)
				return EolType.lf;
			auto c = decode!(UseReplacementDchar.yes)(text, index);
			if (c == '\n')
			{
				if (curLine == line)
				{
					return prevWasCr ? EolType.crlf : EolType.lf;
				}
				curLine++;
			}
			prevWasCr = c == '\r';
		}
		return EolType.lf;
	}
}
451 
/// Helper struct which should have one unique instance in the application which
/// processes document events sent by a LSP client to an LSP server and creates
/// an in-memory representation of all the files managed by the client.
struct TextDocumentManager
{
	/// Internal document storage. Only iterate over this using `foreach`, other
	/// operations are not considered officially supported.
	Document[] documentStore;

	/// Returns the index of the first stored document with the given URI or
	/// -1 if there is none.
	private ptrdiff_t indexOfUri(string uri)
	{
		return documentStore.countUntil!(a => a.uri == uri);
	}

	/// Same as $(LREF tryGet) but throws an exception if the URI doesn't exist.
	ref Document opIndex(string uri)
	{
		auto idx = indexOfUri(uri);
		if (idx == -1)
			throw new Exception("Document '" ~ uri ~ "' not found");
		return documentStore[idx];
	}

	/// Tries to get a document from a URI, returns Document.init if it is not
	/// in the in-memory cache / not sent by the client.
	Document tryGet(string uri)
	{
		auto idx = indexOfUri(uri);
		if (idx == -1)
			return Document.init;
		return documentStore[idx];
	}

	/// Tries to load a given URI manually without having it received via LSP
	/// methods. Note that a LSP close method will unload this early.
	///
	/// The document is always appended with version -1.
	/// NOTE(review): no check is made whether the URI is already stored, in
	/// which case lookups keep returning the older entry - confirm callers
	/// only use this for URIs that are not loaded yet.
	/// Returns: the created document
	/// Throws: FileException in case the file doesn't exist or other file
	///         system errors. In this case no new document should have been
	///         inserted yet.
	ref Document loadFromFilesystem(string uri)
	{
		import served.lsp.uri : uriToFile;
		import fs = std.file;

		string path = uriToFile(uri);
		// read before touching the store so a failure leaves it unchanged
		auto content = fs.readText(path);

		auto index = documentStore.length++;
		documentStore[index].uri = uri;
		documentStore[index].version_ = -1;
		documentStore[index].setContent(content);
		return documentStore[index];
	}

	/// Unloads the given URI so it's no longer accessible. Note that this
	/// should only be done for documents loaded manually and never for LSP
	/// documents as it will break all features in that file until reopened.
	/// Returns: `true` if a document was removed, `false` if the URI was not
	/// stored. The order of the remaining documents may change.
	bool unloadDocument(string uri)
	{
		auto idx = indexOfUri(uri);
		if (idx == -1)
			return false;

		// swap-remove: move the last document into the freed slot
		documentStore[idx] = documentStore[$ - 1];
		documentStore.length--;
		return true;
	}

	/// Returns the currently preferred syncKind to use with the client.
	/// Additionally always supports the `full` sync kind.
	static TextDocumentSyncKind syncKind()
	{
		return TextDocumentSyncKind.incremental;
	}

	/// Processes an LSP packet and performs the document update in-memory that
	/// is requested.
	///
	/// Handles `textDocument/didOpen`, `textDocument/didClose` and
	/// `textDocument/didChange`. A `didOpen` for a URI which is already stored
	/// (for example because it was loaded using $(LREF loadFromFilesystem))
	/// replaces the stored document instead of adding a second entry for the
	/// same URI. A `didChange` for an unknown URI is ignored.
	/// Params:
	///   msg = The request sent by a client. This method only processes
	///     `textDocument/` messages which are relevant to file modification.
	/// Returns: `true` if the given method was handled, `false` otherwise.
	bool process(RequestMessage msg)
	{
		if (msg.method == "textDocument/didOpen")
		{
			auto params = msg.params.fromJSON!DidOpenTextDocumentParams;
			auto opened = Document(params.textDocument);
			// lookups return the first match, so a duplicate entry would
			// shadow the document the client just sent
			auto idx = indexOfUri(opened.uri);
			if (idx >= 0)
				documentStore[idx] = opened;
			else
				documentStore ~= opened;
			return true;
		}
		else if (msg.method == "textDocument/didClose")
		{
			auto targetUri = msg.params["textDocument"]["uri"].str;
			if (!unloadDocument(targetUri))
			{
				warning("Received didClose notification for URI not in system: ", targetUri);
				warning("This can be a potential memory leak if it was previously opened under a different name.");
			}
			return true;
		}
		else if (msg.method == "textDocument/didChange")
		{
			auto targetUri = msg.params["textDocument"]["uri"].str;
			auto idx = indexOfUri(targetUri);
			if (idx >= 0)
			{
				documentStore[idx].version_ = msg.params["textDocument"]["version"].integer;
				foreach (change; msg.params["contentChanges"].array)
				{
					if (auto rangePtr = "range" in change)
					{
						// incremental change: replace only the given range
						auto range = *rangePtr;
						TextRange textRange = cast(Position[2])[
							range["start"].fromJSON!Position, range["end"].fromJSON!Position
						];
						documentStore[idx].applyChange(textRange, change["text"].str);
					}
					else
						// no range: the change carries the full new content
						documentStore[idx].setContent(change["text"].str);
				}
			}
			return true;
		}
		return false;
	}
}
572 
/// Helper structure for storing any data of type T on a per-file basis.
///
/// Every entry remembers the document it was stored for, the version of that
/// document decides whether the data is still handed out.
struct PerDocumentCache(T)
{
	/// A stored value together with the document it was stored for.
	struct Entry
	{
		Document document;
		T data;
	}

	/// All stored entries, at most one per URI when filled using `store`.
	Entry[] entries;

	/// Returns the data stored for `uri` if the version it was stored with is
	/// at least the version of the document currently known to `source`.
	/// Returns `T.init` if nothing is stored for the URI or the stored data
	/// is older than the document in `source`.
	T cached(ref TextDocumentManager source, string uri)
	{
		auto newest = source.tryGet(uri);
		foreach (entry; entries)
		{
			if (entry.document.uri != uri)
				continue;

			// only the first entry for the URI is considered
			return entry.document.version_ >= newest.version_ ? entry.data : T.init;
		}
		return T.init;
	}

	/// Stores `data` for the URI of `document`. An existing entry for the same
	/// URI is overwritten unless it was stored for a newer document version,
	/// in which case nothing is changed.
	void store(Document document, T data)
	{
		foreach (ref entry; entries)
		{
			if (entry.document.uri != document.uri)
				continue;

			if (document.version_ >= entry.document.version_)
			{
				entry.document = document;
				entry.data = data;
			}
			return;
		}
		entries ~= Entry(document, data);
	}
}
615 
/// Returns a range of the identifier/word at the given position, where the
/// position and the returned `[start, end]` pair count UTF-16 code units.
uint[2] wordInLine(const(char)[] line, uint character)
{
	alias utf16Impl = wordInLineImpl!(wchar, uint);
	return utf16Impl(line, character);
}
621 
/// Returns a range of the identifier/word at the given position, where the
/// position and the returned `[start, end]` pair count bytes (UTF-8 code
/// units).
size_t[2] wordInLineBytes(const(char)[] line, size_t bytes)
{
	alias utf8Impl = wordInLineImpl!(char, size_t);
	return utf8Impl(line, bytes);
}
627 
/// Finds the word around `character` in `line`.
///
/// The line is decoded code point by code point, positions are counted in
/// code units of `CharT` (`char` for bytes, `wchar` for UTF-16). A word
/// reaches from behind the last separating character at or before the
/// position up to the next separating character or the end of the line.
/// Params:
///   line = the text to search in, scanning stops at the first separating
///     character behind the position, so it may be longer than one line.
///   character = position inside `line`, counted in `CharT` code units.
/// Returns: `[start, end]` in `CharT` code units, both clamped to
/// `line.length` and with `end >= start`.
SizeT[2] wordInLineImpl(CharT, SizeT)(const(char)[] line, SizeT character)
{
	size_t index = 0;
	SizeT offs = 0;

	// without any separator before the position the word starts at the
	// beginning of the line
	SizeT lastStart = 0;
	SizeT start = character, end = character + 1;
	bool searchStart = true;

	while (index < line.length)
	{
		const c = decode(line, index);
		const l = cast(SizeT) c.codeLength!CharT;

		if (searchStart)
		{
			if (isDIdentifierSeparatingChar(c))
				lastStart = offs + l;

			if (offs + l >= character)
			{
				start = lastStart;
				searchStart = false;
			}

			offs += l;
		}
		else
		{
			if (isDIdentifierSeparatingChar(c))
			{
				// the word ends in front of the separator
				end = offs;
				break;
			}

			// include this character, so a word which runs up to the end of
			// the line is not cut short by its last character
			offs += l;
			end = offs;
		}
	}

	if (start > line.length)
		start = cast(SizeT)line.length;
	if (end > line.length)
		end = cast(SizeT)line.length;
	if (end < start)
		end = start;

	return [start, end];
}

deprecated("use isDIdentifierSeparatingChar instead")
alias isIdentifierSeparatingChar = isDIdentifierSeparatingChar;

/// Returns `true` if `c` is treated as a character which separates two D
/// identifiers: every ASCII character which is not a letter, digit or `_`, as
/// well as the unicode line and paragraph separators. All other characters
/// outside of the ASCII range are not treated as separating.
bool isDIdentifierSeparatingChar(dchar c)
{
	return c < '0' || (c > '9' && c < 'A') || c == '[' || c == '\\' || c == ']'
		|| c == '^' || c == '`' || (c > 'z' && c < 128)
		|| c == '\u2028' || c == '\u2029'; // line separators
}
683 
/// Returns `true` if `s` is not empty, does not start with an ASCII digit and
/// contains no character for which `isDIdentifierSeparatingChar` is true.
bool isValidDIdentifier(const(char)[] s)
{
	import std.ascii : isDigit;

	if (!s.length)
		return false;
	if (s[0].isDigit)
		return false;

	const hasSeparator = s.any!isDIdentifierSeparatingChar;
	return !hasSeparator;
}
691 
// empty strings and strings starting with a digit are rejected, identifiers
// made of letters, digits and underscores are accepted
unittest
{
	foreach (rejected; ["", "0", "10", "1a"])
		assert(!isValidDIdentifier(rejected));

	foreach (accepted; ["_", "a", "__helloWorld123"])
		assert(isValidDIdentifier(accepted));
}
702 
// setContent and applyChange on a single line: every edit must produce the
// expected text and none of them may move the buffer reserved up front
unittest
{
	Document doc;
	doc.text.reserve(16);
	auto reservedBuffer = doc.text.ptr;
	assert(doc.rawText.length == 0);

	foreach (content; ["Hello world", "foo", "foo bar baz baf"])
	{
		doc.setContent(content);
		assert(doc.rawText == content);
	}

	// replaces the columns [from, to) of line 0 and checks the result
	void edit(uint from, uint to, string replacement, string expected)
	{
		doc.applyChange(TextRange(0, from, 0, to), replacement);
		assert(doc.rawText == expected, doc.rawText);
	}

	edit(4, 8, "", "foo baz baf");
	edit(4, 8, "bad", "foo badbaf");
	edit(4, 8, "bath", "foo bathaf");
	edit(4, 10, "bath", "foo bath");
	edit(0, 8, "bath", "bath");
	edit(0, 1, "par", "parath");
	edit(0, 4, "", "th");
	edit(2, 2, "e", "the");
	edit(0, 0, "in", "inthe");

	assert(reservedBuffer is doc.text.ptr);
}
735 
/// Advances both indices over one code point, judging only by the upper four
/// bits of its first UTF-8 byte `c`: `1111` adds 2 UTF-16 code units and 4
/// bytes, `1110` adds 1 and 3, `110x` adds 1 and 2, everything else adds 1
/// and 1. The following bytes are not inspected, a valid encoding is assumed.
pragma(inline, true) private void utf16DecodeUtf8Length(A, B)(char c, ref A utf16Index,
		ref B utf8Index) @safe nothrow @nogc
{
	const leadBits = c & 0b1111_0000;

	if (leadBits == 0b1111_0000)
	{
		// the only case which needs a surrogate pair in UTF-16
		utf16Index += 2;
		utf8Index += 4;
		return;
	}

	utf16Index++;
	if (leadBits == 0b1110_0000)
		utf8Index += 3; // assume valid encoding (no wrong surrogates)
	else if (leadBits == 0b1100_0000 || leadBits == 0b1101_0000)
		utf8Index += 2;
	else
		utf8Index++;
}
761 
/// Returns the number of UTF-16 code units needed for the given UTF-8 text.
///
/// Every byte which is not a continuation byte (`10xxxxxx`) counts as one
/// code unit, bytes from `0xF0` upwards count as two. The text is not
/// validated.
pragma(inline, true) size_t countUTF16Length(scope const(char)[] text) @safe nothrow @nogc
{
	size_t codeUnits;
	foreach (char c; text)
	{
		const isContinuation = (c & 0b1100_0000) == 0b1000_0000;
		if (!isContinuation)
			codeUnits++;
		if (c >= 0xf0)
			codeUnits++;
	}
	return codeUnits;
}
774 
/// Returns the number of bytes of the given UTF-8 text which make up the first
/// `utf16Offset` UTF-16 code units, or `text.length` if the text is shorter.
///
/// The result never points at a continuation byte (`10xxxxxx`): an offset
/// which ends inside a code point is moved behind that code point. The text
/// is not validated.
pragma(inline, true) size_t countBytesUntilUTF16Index(scope const(char)[] text, size_t utf16Offset) @safe nothrow @nogc
{
	size_t pos;
	size_t codeUnits;

	// consume bytes until enough UTF-16 code units have been seen
	for (; codeUnits < utf16Offset && pos < text.length; pos++)
	{
		const char c = text[pos];
		if ((c & 0b1100_0000) != 0b1000_0000)
			codeUnits++;
		if (c >= 0xf0)
			codeUnits++;
	}

	// finish the code point the loop above stopped in
	while (pos < text.length && (text[pos] & 0b1100_0000) == 0b1000_0000)
		pos++;

	return pos;
}
793 
// Test fixtures: a document mixing ASCII with 2, 3 and 4 byte UTF-8 sequences,
// known byte / UTF-16 offset / position triples inside of it, the conversion
// tests using them and a (disabled) benchmark of the conversion functions.
version (unittest)
{
	import core.time;

	// NOTE: the asserted offsets depend on every byte of this literal,
	// including tabs and the trailing space behind the last `”`
	Document testUnicodeDocument = Document.nullDocumentOwnMemory(cast(char[]) `///
/// Copyright © 2020 Somebody (not actually™) x3
///
module some.file;

enum Food : int
{
	pizza = '\U0001F355', // 🍕
	burger = '\U0001F354', // 🍔
	chicken = '\U0001F357', // 🍗
	taco = '\U0001F32E', // 🌮
	wrap = '\U0001F32F', // 🌯
	salad = '\U0001F957', // 🥗
	pasta = '\U0001F35D', // 🍝
	sushi = '\U0001F363', // 🍣
	oden = '\U0001F362', // 🍢
	egg = '\U0001F373', // 🍳
	croissant = '\U0001F950', // 🥐
	baguette = '\U0001F956', // 🥖
	popcorn = '\U0001F37F', // 🍿
	coffee = '\u2615', // ☕
	cookie = '\U0001F36A', // 🍪
}

void main() {
	// taken from https://github.com/DlangRen/Programming-in-D/blob/master/ddili/src/ders/d.cn/aa.d
	int[string] colorCodes = [ /* ... */ ];

	if ("purple" in colorCodes) {
		// ü®™🍳键 “purple” 在表中

	} else { // line 31
		//表中不存在 键 “purple” 
	}

	string x;
}`);

	// start of file
	enum testSOF_byte = 0;
	enum testSOF_offset = 0;
	enum testSOF_position = Position(0, 0);

	// end of file
	enum testEOF_byte = 872;
	enum testEOF_offset = 805;
	enum testEOF_position = Position(36, 1);

	// in line before unicode
	enum testLinePreUni_byte = 757;
	enum testLinePreUni_offset = 724;
	enum testLinePreUni_position = Position(29, 4); // after `//`

	// in line after unicode
	enum testLinePostUni_byte = 789;
	enum testLinePostUni_offset = 742;
	enum testLinePostUni_position = Position(29, 22); // after `purple” 在`

	// ascii line after unicode line
	enum testMidAsciiLine_byte = 804;
	enum testMidAsciiLine_offset = 753;
	enum testMidAsciiLine_position = Position(31, 7);

	// checks all six conversions between bytes, offsets and positions for
	// each of the locations above and for values past the end of the text
	@("{offset, bytes, position} -> {offset, bytes, position}")
	unittest
	{
		import std.conv;
		import std.stdio;

		static foreach (test; [
				"SOF", "EOF", "LinePreUni", "LinePostUni", "MidAsciiLine"
			])
		{
			{
				enum testOffset = mixin("test" ~ test ~ "_offset");
				enum testByte = mixin("test" ~ test ~ "_byte");
				enum testPosition = mixin("test" ~ test ~ "_position");

				writeln(" === Test ", test, " ===");

				writeln(testByte, " byte -> offset ", testOffset);
				assert(testUnicodeDocument.bytesToOffset(testByte) == testOffset,
						"fail " ~ test ~ " byte->offset = " ~ testUnicodeDocument.bytesToOffset(testByte)
						.to!string);
				writeln(testByte, " byte -> position ", testPosition);
				assert(testUnicodeDocument.bytesToPosition(testByte) == testPosition,
						"fail " ~ test ~ " byte->position = " ~ testUnicodeDocument.bytesToPosition(testByte)
						.to!string);

				writeln(testOffset, " offset -> byte ", testByte);
				assert(testUnicodeDocument.offsetToBytes(testOffset) == testByte,
						"fail " ~ test ~ " offset->byte = " ~ testUnicodeDocument.offsetToBytes(testOffset)
						.to!string);
				writeln(testOffset, " offset -> position ", testPosition);
				assert(testUnicodeDocument.offsetToPosition(testOffset) == testPosition,
						"fail " ~ test ~ " offset->position = " ~ testUnicodeDocument.offsetToPosition(testOffset)
						.to!string);

				writeln(testPosition, " position -> offset ", testOffset);
				assert(testUnicodeDocument.positionToOffset(testPosition) == testOffset,
						"fail " ~ test ~ " position->offset = " ~ testUnicodeDocument.positionToOffset(testPosition)
						.to!string);
				writeln(testPosition, " position -> byte ", testByte);
				assert(testUnicodeDocument.positionToBytes(testPosition) == testByte,
						"fail " ~ test ~ " position->byte = " ~ testUnicodeDocument.positionToBytes(testPosition)
						.to!string);

				writeln();
			}
		}

		// values past the end of the text must map to the end of the text
		const size_t maxBytes = testEOF_byte;
		const size_t maxOffset = testEOF_offset;
		const Position maxPosition = testEOF_position;

		writeln("max offset -> byte");
		assert(testUnicodeDocument.offsetToBytes(size_t.max) == maxBytes);
		writeln("max offset -> position");
		assert(testUnicodeDocument.offsetToPosition(size_t.max) == maxPosition);
		writeln("max byte -> offset");
		assert(testUnicodeDocument.bytesToOffset(size_t.max) == maxOffset);
		writeln("max byte -> position");
		assert(testUnicodeDocument.bytesToPosition(size_t.max) == maxPosition);
		writeln("max position -> offset");
		assert(testUnicodeDocument.positionToOffset(Position(uint.max, uint.max)) == maxOffset);
		writeln("max position -> byte");
		assert(testUnicodeDocument.positionToBytes(Position(uint.max, uint.max)) == maxBytes);
	}

	// disabled by default: times the six conversion functions on random
	// locations inside the test document
	version (none)
	@("character transform benchmarks")
	unittest
	{
		import std.datetime.stopwatch;
		import std.random;
		import std.stdio;

		enum PositionCount = 32;
		size_t[PositionCount] testBytes;
		size_t[PositionCount] testOffsets;
		Position[PositionCount] testPositions;

		static immutable funs = [
			"offsetToBytes", "offsetToPosition", "bytesToOffset", "bytesToPosition",
			"positionToOffset", "positionToBytes"
		];

		size_t debugSum;

		size_t lengthUtf16 = testUnicodeDocument.text.codeLength!wchar;
		enum TestRepeats = 10;
		Duration[TestRepeats][funs.length] times;

		StopWatch sw;
		static foreach (iterations; [
				1e3, 1e4, /* 1e5 */
			])
		{
			writeln("==================");
			writeln("Timing ", iterations, "x", PositionCount, "x", TestRepeats, " iterations:");
			foreach (ref row; times)
				foreach (ref col; row)
					col = Duration.zero;

			static foreach (t; 0 .. TestRepeats)
			{
				// pick new random locations for every repeat
				foreach (i, ref v; testOffsets)
				{
					v = uniform(0, lengthUtf16);
					testBytes[i] = testUnicodeDocument.offsetToBytes(v);
					testPositions[i] = testUnicodeDocument.offsetToPosition(v);
				}
				static foreach (fi, fun; funs)
				{
					sw.reset();
					sw.start();
					foreach (i; 0 .. iterations)
					{
						foreach (v; 0 .. PositionCount)
						{
							// the first letter of the function name tells
							// which kind of input it takes
							static if (fun[0] == 'b')
								mixin("debugSum |= testUnicodeDocument." ~ fun ~ "(testBytes[v]).sumVal;");
							else static if (fun[0] == 'o')
								mixin("debugSum |= testUnicodeDocument." ~ fun ~ "(testOffsets[v]).sumVal;");
							else static if (fun[0] == 'p')
								mixin("debugSum |= testUnicodeDocument." ~ fun ~ "(testPositions[v]).sumVal;");
							else
								static assert(false);
						}
					}
					sw.stop();
					times[fi][t] = sw.peek;
				}
			}
			static foreach (fi, fun; funs)
			{
				writeln(fun, ": ", formatDurationDistribution(times[fi]));
			}
			writeln();
			writeln();
		}

		writeln("tricking the optimizer", debugSum);
	}

	// folds a benchmark result into a number so the calls have a used result
	private pragma(inline, true) size_t sumVal(size_t v) pure @safe nothrow @nogc
	{
		return v;
	}

	// reinterprets the memory of the position as one ulong
	// NOTE(review): assumes Position is at least 8 bytes large - confirm
	private pragma(inline, true) size_t sumVal(Position v) pure @trusted nothrow @nogc
	{
		return cast(size_t)*(cast(ulong*)&v);
	}

	// Formats average, median and (population) standard deviation of the
	// given durations, all in milliseconds.
	private string formatDurationDistribution(size_t n)(Duration[n] durs)
	{
		import std.algorithm : fold, map, sort, sum;
		import std.format : format;
		import std.math : sqrt;

		Duration total = durs[].fold!"a+b";
		sort!"a<b"(durs[]);
		double msAvg = cast(double) total.total!"hnsecs" / 10_000.0 / n;
		double msMedian = cast(double) durs[$ / 2].total!"hnsecs" / 10_000.0;
		double[n] diffs = 0;
		foreach (i, dur; durs)
			diffs[i] = (cast(double) dur.total!"hnsecs" / 10_000.0) - msAvg;
		// standard deviation: root of the mean (not the sum) of the squared
		// differences to the average
		double msStdDeviation = (diffs[].map!"a*a".sum / n).sqrt;
		return format!"[avg=%.4fms, median=%.4f, sd=%.4f]"(msAvg, msMedian, msStdDeviation);
	}
}