Add helpers to truncate UTF-8 at code points

This helps avoid truncating a UTF-8 string in the middle of a code point,
which would produce an invalid UTF-8 result.
This commit is contained in:
Romain Vimont 2019-05-30 19:01:08 +02:00
commit 0a7fe7ad57
6 changed files with 121 additions and 1 deletions

View file

@ -0,0 +1,23 @@
package com.genymobile.scrcpy;
/**
 * Static helpers for working with UTF-8 encoded byte arrays.
 */
public final class StringUtils {
    private StringUtils() {
        // not instantiable
    }

    /**
     * Computes how many leading bytes of {@code utf8} can be kept, within a
     * budget of {@code maxLength} bytes, without cutting a multi-byte UTF-8
     * sequence in the middle.
     *
     * <p>If the array already fits in {@code maxLength} bytes, its full length
     * is returned. Otherwise the cut position starts at {@code maxLength} and
     * is moved backwards while the byte at that position is a continuation
     * byte ({@code 10xxxxxx}), so that the first dropped byte is never in the
     * middle of a sequence.
     *
     * <p>The input is not validated. If every byte from the cut position back
     * to the start of the array is a continuation byte (malformed UTF-8), the
     * result is {@code 0}.
     *
     * @param utf8 the encoded bytes
     * @param maxLength the maximum number of bytes to keep, must not be negative
     * @return the number of leading bytes to keep, between {@code 0} and
     *         {@code min(utf8.length, maxLength)}
     * @throws NullPointerException if {@code utf8} is {@code null}
     * @throws IllegalArgumentException if {@code maxLength} is negative
     */
    @SuppressWarnings("checkstyle:MagicNumber")
    public static int getUtf8TruncationIndex(byte[] utf8, int maxLength) {
        if (maxLength < 0) {
            throw new IllegalArgumentException("maxLength must not be negative: " + maxLength);
        }
        int len = utf8.length;
        if (len <= maxLength) {
            return len;
        }
        len = maxLength;
        // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
        // The "len > 0" bound keeps the scan inside the array when the input
        // starts with stray continuation bytes.
        while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
            // the next byte is not the start of a new UTF-8 codepoint
            // so if we would cut there, the character would be truncated
            len--;
        }
        return len;
    }
}

View file

@ -0,0 +1,44 @@
package com.genymobile.scrcpy;
import junit.framework.Assert;
import org.junit.Test;
import java.nio.charset.StandardCharsets;
/**
 * Unit tests for {@link StringUtils}.
 */
public class StringUtilsTest {

    /**
     * Calls {@link StringUtils#getUtf8TruncationIndex(byte[], int)} on the
     * UTF-8 encoding of a short string mixing 1-byte and 2-byte characters,
     * for every {@code maxLength} from 1 to one past the encoded length, and
     * checks each returned index.
     */
    @Test
    @SuppressWarnings("checkstyle:MagicNumber")
    public void testUtf8Trucate() {
        final byte[] encoded = "aÉbÔc".getBytes(StandardCharsets.UTF_8);
        Assert.assertEquals(7, encoded.length);

        // expectedIndexes[i] is the expected result for maxLength == i + 1
        final int[] expectedIndexes = {
            1,
            1, // É is 2 bytes-wide
            3,
            4,
            4, // Ô is 2 bytes-wide
            6,
            7,
            7, // no more chars
        };

        for (int i = 0; i < expectedIndexes.length; i++) {
            final int maxLength = i + 1;
            final int actual = StringUtils.getUtf8TruncationIndex(encoded, maxLength);
            Assert.assertEquals(expectedIndexes[i], actual);
        }
    }
}