|
| 1 | +''' |
| 2 | +A character in UTF8 can be from 1 to 4 bytes long, subjected to the following rules: |
| 3 | +
|
| 4 | +For a 1-byte character, the first bit is a 0, followed by its unicode code. |
| 5 | +For an n-byte character, the first n bits are all ones, the (n+1)-th bit is 0, followed by n-1 bytes whose most significant 2 bits are 10. |
| 6 | +This is how the UTF-8 encoding would work: |
| 7 | +
|
| 8 | + Char. number range | UTF-8 octet sequence |
| 9 | + (hexadecimal) | (binary) |
| 10 | + --------------------+--------------------------------------------- |
| 11 | + 0000 0000-0000 007F | 0xxxxxxx |
| 12 | + 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
| 13 | + 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 14 | + 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 15 | +Given an array of integers representing the data, return whether it is a valid utf-8 encoding. |
| 16 | +
|
| 17 | +Note: |
| 18 | +The input is an array of integers. Only the least significant 8 bits of each integer are used to store the data. This means each integer represents only 1 byte of data. |
| 19 | +
|
| 20 | +Example 1: |
| 21 | +
|
| 22 | +data = [197, 130, 1], which represents the octet sequence: 11000101 10000010 00000001. |
| 23 | +
|
| 24 | +Return true. |
| 25 | +It is a valid utf-8 encoding for a 2-byte character followed by a 1-byte character. |
| 26 | +Example 2: |
| 27 | +
|
| 28 | +data = [235, 140, 4], which represents the octet sequence: 11101011 10001100 00000100. |
| 29 | +
|
| 30 | +Return false. |
| 31 | +The first 3 bits are all ones and the 4th bit is 0, which means it is a 3-byte character. |
| 32 | +The next byte is a continuation byte which starts with 10 and that's correct. |
| 33 | +But the second continuation byte does not start with 10, so it is invalid. |
| 34 | +''' |
| 35 | + |
class Solution(object):
    """Checks whether a list of integers forms a valid UTF-8 byte sequence."""

    def validUtf8(self, data):
        """Return True if ``data`` follows the UTF-8 lead/continuation layout.

        Each integer stands for one byte; only bits 7..0 are inspected, so
        any higher bits of an integer are ignored.  A lead byte announces a
        character of 1 to 4 bytes through its run of leading 1 bits, and
        every following byte of that character must start with ``10``.

        Only the bit layout is validated.  Overlong encodings, surrogate
        code points and values above U+10FFFF are not rejected.

        :type data: List[int]
        :rtype: bool
        """
        high_bit = 1 << 7    # 1000 0000
        second_bit = 1 << 6  # 0100 0000
        pending = 0          # continuation bytes still owed to the current char

        for num in data:
            if pending:
                # Inside a multi-byte character: the byte must look like 10xxxxxx.
                if not (num & high_bit) or (num & second_bit):
                    return False
                pending -= 1
                continue

            # Start of a character: count the leading 1 bits of the lead byte.
            # The mask reaches 0 after eight shifts, so the loop always ends.
            leading_ones = 0
            mask = high_bit
            while num & mask:
                leading_ones += 1
                mask >>= 1

            if leading_ones == 0:
                # 0xxxxxxx: a complete single-byte character.
                continue
            if leading_ones == 1 or leading_ones > 4:
                # 10xxxxxx cannot start a character, and no character is
                # longer than 4 bytes.
                return False
            pending = leading_ones - 1

        # A trailing character that is missing continuation bytes is invalid.
        return pending == 0
0 commit comments