UTF-8编码规则

8 min readMay 28, 2020

将一个任意层级的包含 file文件以及中文字符的json 通过 blob 方式上传到服务器

javascript 上传入口

// In the XHR's request headers
{
...
'Content-Type': 'application/octet-stream',
...
}

File

step 1: 
  File => {fileName:'1.png',fileData:[BYTES DATA]}
step 2:
  ...123, 102, 105, 108, 101, 78, 97, 109, 101, 58, 39, 49, 46, 112, 110, 103, 39, 44, 102, 105, 108, 101, 68, 97, 116, 97, 58...

处理码点大于 127 的字符（例如中文）时需要手动进行编码，以下是编码规则

Byte and bit

UTF-8 encoding rules

UTF-8使用一至六个字节为每个字符编码（尽管如此，2003年11月UTF-8被RFC 3629重新规范，只能使用原来Unicode定义的区域，U+0000到U+10FFFF，也就是说最多四个字节）

// Walk-through of the UTF-8 rules on concrete values. Expected results are
// written as comments under the expression that produces them.

// Number of payload bits available in a 1..6 byte sequence, and the first
// value that no longer fits in that many bits.
[7, 11, 16, 21, 26, 31].forEach((num) => {
    console.log(num, 2 ** num);
});
// 7 128
// 11 2048
// 16 65536
// 21 2097152
// 26 67108864
// 31 2147483648

'h'.charCodeAt(0);
// 104
// 104 < 128, so one byte is enough:
// 0b01101000

'hello world!'.split('').map((item) => '0b' + item.charCodeAt(0).toString(2).padStart(8, '0'));
// 0b01101000
// 0b01100101
// 0b01101100
// 0b01101100
// 0b01101111
// 0b00100000
// 0b01110111
// 0b01101111
// 0b01110010
// 0b01101100
// 0b01100100
// 0b00100001
// All of the above are below 128, so each character is a single byte.

'中'.charCodeAt(0).toString(2);
// 100111000101101
// 2048 <= 20013 < 65536
// => encoded with three bytes
// Pad to 16 bits:
// 0100111000101101
// Split 4 + 6 + 6 and prepend 1110 / 10 / 10:
// 11100100 10111000 10101101
// 228      184      173

// Decoding '中' (the reverse direction)
// 228      184      173
(228).toString(2).padStart(8, '0');
// 11100100
(184).toString(2).padStart(8, '0');
// 10111000
(173).toString(2).padStart(8, '0');
// 10101101

// 11100100 10111000 10101101
// Strip the 1110 / 10 / 10 prefixes and join the payload bits:
// 0100111000101101
// 20013
// String.fromCharCode(20013)
// '中'

简单的实现

/**
 * Encode the first UTF-16 code unit of `char` as UTF-8 bytes.
 *
 * Straightforward version: one independent range check per encoded length.
 * `charCodeAt(0)` returns a single UTF-16 code unit (0..65535), so for string
 * input only the 1-3 byte branches can fire; the 4-6 byte branches spell out
 * the rule for larger values.
 *
 * @param {string} char - String whose first code unit is encoded.
 * @returns {number[]} The encoded bytes (each 0..255); empty when `char` is empty.
 */
const UTF8Encode = (char) => {
 // decimal value of the first code unit
 const unicode = char.charCodeAt(0)
 const bytes = []

 // 1 byte
 // 0xxxxxxx
 if (unicode >= 0 && unicode <= 127) {
     bytes.push(unicode)
 }

 // 2 byte
 // 110xxxxx 10xxxxxx
 if (unicode >= 128 && unicode <= 2047) {
     // unicode >> 6       drops the lowest six bits
     // ... & 0b11111      keeps the lowest five bits of what is left
     // ... | 0b11000000   prepends the 110 marker
     bytes.push(unicode >> 6 & 0b11111 | 0b11000000)

     // unicode & 0b111111 keeps the lowest six bits
     // ... | 0b10000000   prepends the 10 continuation marker
     bytes.push(unicode & 0b111111 | 0b10000000)
 }

 // 3 byte
 // 1110xxxx 10xxxxxx 10xxxxxx
 if (unicode >= 2048 && unicode <= 65535) {
     bytes.push(unicode >> 12 & 0b1111 | 0b11100000)
     bytes.push(unicode >> 6 & 0b111111 | 0b10000000)
     bytes.push(unicode & 0b111111 | 0b10000000)
 }

 // 4 byte
 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 if (unicode >= 65536 && unicode <= 2097151) {
     bytes.push(unicode >> 18 & 0b111 | 0b11110000)
     bytes.push(unicode >> 12 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 6 & 0b111111 | 0b10000000)
     bytes.push(unicode & 0b111111 | 0b10000000)
 }

 // 5 byte
 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 if (unicode >= 2097152 && unicode <= 67108863) {
     bytes.push(unicode >> 24 & 0b11 | 0b11111000)
     bytes.push(unicode >> 18 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 12 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 6 & 0b111111 | 0b10000000)
     bytes.push(unicode & 0b111111 | 0b10000000)
 }

 // 6 byte
 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 if (unicode >= 67108864 && unicode <= 2147483647) {
     bytes.push(unicode >> 30 & 0b1 | 0b11111100)
     bytes.push(unicode >> 24 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 18 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 12 & 0b111111 | 0b10000000)
     bytes.push(unicode >> 6 & 0b111111 | 0b10000000)
     bytes.push(unicode & 0b111111 | 0b10000000)
 }

 return bytes
}
// Simplified, this becomes:
/**
 * Encode the first UTF-16 code unit of `char` as UTF-8 bytes.
 *
 * Nested version: each level pushes only the leading byte(s) specific to one
 * encoded length; the continuation bytes shared by several lengths are pushed
 * once, after the branch that distinguishes them.
 *
 * @param {string} char - String whose first code unit is encoded.
 * @returns {number[]} The encoded bytes (each 0..255); empty when `char` is empty.
 */
const UTF8Encode = (char) => {
    // decimal value of the first code unit
    const unicode = char.charCodeAt(0)
    const bytes = []

    // charCodeAt(0) on an empty string is NaN; without this guard NaN would
    // fall into the 1-byte branch and be pushed as a "byte".
    if (Number.isNaN(unicode)) {
        return bytes
    }

    if (unicode > 127) {
        if (unicode > 2047) {
            if (unicode > 65535) {
                if (unicode > 2097151) {
                    if (unicode > 67108863) {
                        // 6 byte
                        // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                        bytes.push(unicode >> 30 & 0b1 | 0b11111100)
                        bytes.push(unicode >> 24 & 0b111111 | 0b10000000)
                    } else {
                        // 5 byte
                        // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                        bytes.push(unicode >> 24 & 0b11 | 0b11111000)
                    }
                    // 5 6 common
                    bytes.push(unicode >> 18 & 0b111111 | 0b10000000)
                } else {
                    // 4 byte
                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    bytes.push(unicode >> 18 & 0b111 | 0b11110000)
                }
                // 4 5 6 common
                bytes.push(unicode >> 12 & 0b111111 | 0b10000000)
            } else {
                // 3 byte
                // 1110xxxx 10xxxxxx 10xxxxxx
                bytes.push(unicode >> 12 & 0b1111 | 0b11100000)
            }
            // 3 4 5 6 common
            bytes.push(unicode >> 6 & 0b111111 | 0b10000000)
        } else {
            // 2 byte
            // 110xxxxx 10xxxxxx
            bytes.push(unicode >> 6 & 0b11111 | 0b11000000)
        }
        // 2 3 4 5 6 common
        bytes.push(unicode & 0b111111 | 0b10000000)
    } else {
        // 1 byte
        // 0xxxxxxx
        bytes.push(unicode)
    }

    return bytes
}

Decode：按上述规则反向解码即可（去掉每个字节的前缀标记位，再把剩余的有效位按顺序拼接起来）

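此处未涵盖 emoji 的处理：emoji 等码点大于 U+FFFF 的字符在 JavaScript 字符串中占两个 UTF-16 码元（代理对），charCodeAt(0) 只能取到其中第一个码元，需要改用 codePointAt 取得完整码点后再编码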

UTF-8编码规则

Written by yahone chow