@@ -45,8 +45,10 @@ function StringDecoder(encoding) {
4545 case 'utf16le':
4646 this.text = utf16Text;
4747 this.end = utf16End;
48- // fall through
48+ nb = 4;
49+ break;
4950 case 'utf8':
51+ this.fillLast = utf8FillLast;
5052 nb = 4;
5153 break;
5254 case 'base64':
@@ -88,7 +90,7 @@ StringDecoder.prototype.end = utf8End;
8890// Returns only complete characters in a Buffer
8991StringDecoder.prototype.text = utf8Text;
9092
91- // Attempts to complete a partial character using bytes from a Buffer
93+ // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
9294StringDecoder.prototype.fillLast = function(buf) {
9395 if (this.lastNeed <= buf.length) {
9496 buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@@ -112,38 +114,83 @@ function utf8CheckByte(byte) {
112114 return -1;
113115}
114116
// Checks at most 3 bytes at the end of a Buffer in order to detect an
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
// needed to complete the UTF-8 character (if applicable) is returned.
//
// self: decoder state object; `self.lastNeed` is written when a lead byte of
//       a multi-byte character is found among the inspected bytes.
// buf:  the bytes to inspect (only `length` and indexed reads are used).
// i:    lowest index that may be inspected; bytes before `i` are never read.
//
// Returns 0 when nothing was inspected, when the last byte is a single-byte
// character, when none of the inspected bytes is a lead byte, or when a
// 2-byte lead is found three bytes from the end. Otherwise returns the value
// reported by utf8CheckByte for the lead byte that was found.
//
// NOTE(review): utf8CheckByte is defined elsewhere in this file. From its use
// here, a negative result means "not a lead byte, keep scanning backwards",
// 0 means a single-byte character, and a positive result is the total byte
// length announced by a lead byte -- confirm against its definition.
function utf8CheckIncomplete(self, buf, i) {
  let j = buf.length - 1;
  if (j < i)
    return 0;

  // Last byte: a lead byte here has none of its continuation bytes yet.
  let nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0)
      self.lastNeed = nb - 1;
    return nb;
  }

  // Second-to-last byte: one byte after the lead is already present.
  if (--j < i)
    return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0)
      self.lastNeed = nb - 2;
    return nb;
  }

  // Third-to-last byte: two bytes after the lead are already present.
  if (--j < i)
    return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) {
      // A 2-byte lead followed by two more bytes cannot still be incomplete,
      // so report "nothing pending" and leave lastNeed untouched.
      if (nb === 2)
        nb = 0;
      else
        self.lastNeed = nb - 3;
    }
    return nb;
  }
  return 0;
}
146152
// Validates as many continuation bytes for a multi-byte UTF-8 character as
// are needed or are available at the start of `buf`. If we see a
// non-continuation byte where we expect one, we "replace" the bytes seen so
// far with UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8
// decoding behavior. The continuation byte check is written out three times
// (rather than as a loop) to cover the case where all of the continuation
// bytes for a character exist in the same buffer, and as a slight performance
// increase.
//
// self: decoder state object; `self.lastNeed` is read as the number of
//       continuation bytes still expected, and is overwritten with the index
//       of the offending byte when validation fails.
// buf:  the incoming bytes.
// p:    count added to the offending byte's index to size the replacement
//       string (the caller passes lastTotal - lastNeed).
//
// Returns '\ufffd' repeated (p + index of the offending byte) times when a
// non-continuation byte is found, otherwise undefined.
function utf8CheckExtraBytes(self, buf, p) {
  // An empty chunk has nothing to validate. Without this guard, buf[0] would
  // be undefined, fail the mask test below, and the pending character would
  // be discarded even though no invalid byte was ever seen.
  if (buf.length === 0)
    return;
  if ((buf[0] & 0xC0) !== 0x80) {
    self.lastNeed = 0;
    return '\ufffd'.repeat(p);
  }
  if (self.lastNeed > 1 && buf.length > 1) {
    if ((buf[1] & 0xC0) !== 0x80) {
      self.lastNeed = 1;
      return '\ufffd'.repeat(p + 1);
    }
    if (self.lastNeed > 2 && buf.length > 2) {
      if ((buf[2] & 0xC0) !== 0x80) {
        self.lastNeed = 2;
        return '\ufffd'.repeat(p + 2);
      }
    }
  }
}
179+
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
// Must be called with `this` bound to the decoder state (it is installed as
// `this.fillLast` for the 'utf8' encoding).
//
// buf: the incoming bytes.
//
// Returns:
//   - a string of replacement characters when utf8CheckExtraBytes rejects
//     the incoming bytes,
//   - the decoded character when `buf` supplies all of the bytes still needed,
//   - undefined when the character is still incomplete; in that case the
//     bytes of `buf` are appended to `this.lastChar` and `this.lastNeed` is
//     reduced by `buf.length`.
function utf8FillLast(buf) {
  // An empty chunk contributes no bytes; leave the pending character as is
  // instead of letting validation run against a byte that does not exist.
  if (buf.length === 0)
    return;
  // Number of bytes of the partial character that are already buffered.
  const p = this.lastTotal - this.lastNeed;
  const r = utf8CheckExtraBytes(this, buf, p);
  if (r !== undefined)
    return r;
  if (this.lastNeed <= buf.length) {
    buf.copy(this.lastChar, p, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  buf.copy(this.lastChar, p, 0, buf.length);
  this.lastNeed -= buf.length;
}
193+
147194// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
148195// partial character, the character's bytes are buffered until the required
149196// number of bytes are available.
0 commit comments