From: Peter Olcott on
I used the two tables from this link as the basis for my design:
http://en.wikipedia.org/wiki/UTF-8

I would like this reviewed for algorithm correctness:

// Encodes a sequence of UTF-32 code points as UTF-8.
//
// Parameters:
//   UTF32 - input code points; read only, never modified here.
//   UTF8  - output byte vector; the encoded bytes are APPENDED to whatever
//           it already contains.
//
// Encoding lengths:
//   U+0000   .. U+007F    -> 1 byte   0xxxxxxx
//   U+0080   .. U+07FF    -> 2 bytes  110xxxxx 10xxxxxx
//   U+0800   .. U+FFFF    -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
//   U+10000  .. U+10FFFF  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// Invalid input (a surrogate U+D800..U+DFFF, or a value above U+10FFFF) is
// reported on stdout and skipped: no bytes are emitted for it and conversion
// continues with the next code point.
void UnicodeEncodingConversion::
toUTF8(std::vector<uint32_t>& UTF32, std::vector<uint8_t>& UTF8) {
    // Index with the vector's own size_type so the counter cannot wrap on
    // platforms where size() is wider than 32 bits.
    typedef std::vector<uint32_t>::size_type Index;

    // Worst case is 4 bytes per code point, on top of what is already stored.
    UTF8.reserve(UTF8.size() + UTF32.size() * 4);

    for (Index N = 0; N < UTF32.size(); ++N) {
        const uint32_t CodePoint = UTF32[N];

        if (CodePoint <= 0x7F) {
            UTF8.push_back(static_cast<uint8_t>(CodePoint));
        }
        else if (CodePoint <= 0x7FF) {
            UTF8.push_back(static_cast<uint8_t>(0xC0 | (CodePoint >> 6)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | (CodePoint & 0x3F)));
        }
        else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
            // Surrogates are reserved for UTF-16 and are not Unicode scalar
            // values; encoding them would yield ill-formed UTF-8.
            printf("%lu is a UTF-16 surrogate, not a valid Unicode scalar value!\n",
                   static_cast<unsigned long>(CodePoint));
        }
        else if (CodePoint <= 0xFFFF) {
            UTF8.push_back(static_cast<uint8_t>(0xE0 | (CodePoint >> 12)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | ((CodePoint >> 6) & 0x3F)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | (CodePoint & 0x3F)));
        }
        else if (CodePoint <= 0x10FFFF) {
            UTF8.push_back(static_cast<uint8_t>(0xF0 | (CodePoint >> 18)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | ((CodePoint >> 12) & 0x3F)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | ((CodePoint >> 6) & 0x3F)));
            UTF8.push_back(static_cast<uint8_t>(0x80 | (CodePoint & 0x3F)));
        }
        else {
            // The value is unsigned, so print it with an unsigned conversion;
            // the cast keeps the argument type matched to %lu on every platform.
            printf("%lu is outside of the Unicode range!\n",
                   static_cast<unsigned long>(CodePoint));
        }
    }
}
From: Leigh Johnston on
"Peter Olcott" <NoSpam(a)OCR4Screen.com> wrote in message
news:t4SdneDjLJvMSJ7RnZ2dnUVZ_sudnZ2d(a)giganews.com...
> I used the two tables from this link as the basis for my design:
> http://en.wikipedia.org/wiki/UTF-8
>
> I would like this reviewed for algorithm correctness:
>
> void UnicodeEncodingConversion::
> toUTF8(std::vector<uint32_t>& UTF32, std::vector<uint8_t>& UTF8) {
> uint8_t Byte;
> uint32_t CodePoint;
> UTF8.reserve(UTF32.size() * 4); // worst case
> for (uint32_t N = 0; N < UTF32.size(); N++) {
> CodePoint = UTF32[N];
>
> if (CodePoint <= 0x7F) {
> Byte = CodePoint;
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0x7FF) {
> Byte = 0xC0 | (CodePoint >> 6);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0xFFFF) {
> Byte = 0xE0 | (CodePoint >> 12);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0x10FFFF) {
> Byte = 0xF0 | (CodePoint >> 18);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 12) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else
> printf("%d is outside of the Unicode range!\n", CodePoint);
> }
> }

Why on earth would you have such a function emit something to stdout?
Consider throwing an exception instead. Also printf sucks, this is a C++
newsgroup not a C newsgroup. I cannot be arsed reviewing the rest of your
algorithm as I generally don't do such things for free (for random people at
least). :)

/Leigh

From: Daniel T. on
Peter Olcott <NoSpam(a)OCR4Screen.com> wrote:

> I used the two tables from this link as the basis for my design:
> http://en.wikipedia.org/wiki/UTF-8

I suggest you use http://unicode.org/ for your source. Why use a
secondary source when the primary source is easily available?

> I would like this reviewed for algorithm correctness:

Surely your tests have already shown whether the algorithm is correct.

> void UnicodeEncodingConversion::
> toUTF8(std::vector<uint32_t>& UTF32, std::vector<uint8_t>& UTF8) {
> uint8_t Byte;
> uint32_t CodePoint;
> UTF8.reserve(UTF32.size() * 4); // worst case
> for (uint32_t N = 0; N < UTF32.size(); N++) {
> CodePoint = UTF32[N];

I suggest you use an iterator instead of an integer for the loop. That
way you won't need the extraneous variable.

> if (CodePoint <= 0x7F) {
> Byte = CodePoint;
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0x7FF) {
> Byte = 0xC0 | (CodePoint >> 6);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0xFFFF) {
> Byte = 0xE0 | (CodePoint >> 12);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else if (CodePoint <= 0x10FFFF) {

The codes 10FFFE and 10FFFF are guaranteed not to be unicode
characters...

> Byte = 0xF0 | (CodePoint >> 18);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 12) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
> UTF8.push_back(Byte);
> Byte = 0x80 | (CodePoint & 0x3F);
> UTF8.push_back(Byte);
> }
> else
> printf("%d is outside of the Unicode range!\n", CodePoint);

Throw is more appropriate here.

> }
> }
From: Leigh Johnston on
"Daniel T." <daniel_t(a)earthlink.net> wrote in message
news:daniel_t-B347BC.12353031052010(a)70-3-168-216.pools.spcsdns.net...
> Peter Olcott <NoSpam(a)OCR4Screen.com> wrote:
>> void UnicodeEncodingConversion::
>> toUTF8(std::vector<uint32_t>& UTF32, std::vector<uint8_t>& UTF8) {
>> uint8_t Byte;
>> uint32_t CodePoint;
>> UTF8.reserve(UTF32.size() * 4); // worst case
>> for (uint32_t N = 0; N < UTF32.size(); N++) {
>> CodePoint = UTF32[N];
>
> I suggest you use an iterator instead of an integer for the loop. That
> way you wont need the extraneous variable.
>

Then the iterator would be extraneous, surely? Unless you mean CodePoint is
the extraneous variable, which it isn't: it is accessed multiple times, and
dereferencing an iterator multiple times would not be as efficient (modulo
any compiler optimizations); it certainly is not as clear as using a
temporary (IMO).

/Leigh

From: Peter Olcott on
On 5/31/2010 10:49 AM, Leigh Johnston wrote:
> "Peter Olcott" <NoSpam(a)OCR4Screen.com> wrote in message
> news:t4SdneDjLJvMSJ7RnZ2dnUVZ_sudnZ2d(a)giganews.com...
>> I used the two tables from this link as the basis for my design:
>> http://en.wikipedia.org/wiki/UTF-8
>>
>> I would like this reviewed for algorithm correctness:
>>
>> void UnicodeEncodingConversion::
>> toUTF8(std::vector<uint32_t>& UTF32, std::vector<uint8_t>& UTF8) {
>> uint8_t Byte;
>> uint32_t CodePoint;
>> UTF8.reserve(UTF32.size() * 4); // worst case
>> for (uint32_t N = 0; N < UTF32.size(); N++) {
>> CodePoint = UTF32[N];
>>
>> if (CodePoint <= 0x7F) {
>> Byte = CodePoint;
>> UTF8.push_back(Byte);
>> }
>> else if (CodePoint <= 0x7FF) {
>> Byte = 0xC0 | (CodePoint >> 6);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | (CodePoint & 0x3F);
>> UTF8.push_back(Byte);
>> }
>> else if (CodePoint <= 0xFFFF) {
>> Byte = 0xE0 | (CodePoint >> 12);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | (CodePoint & 0x3F);
>> UTF8.push_back(Byte);
>> }
>> else if (CodePoint <= 0x10FFFF) {
>> Byte = 0xF0 | (CodePoint >> 18);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | ((CodePoint >> 12) & 0x3F);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | ((CodePoint >> 6) & 0x3F);
>> UTF8.push_back(Byte);
>> Byte = 0x80 | (CodePoint & 0x3F);
>> UTF8.push_back(Byte);
>> }
>> else
>> printf("%d is outside of the Unicode range!\n", CodePoint);
>> }
>> }
>
> Why on earth would you have such a function emit something to stdout?

Because it is a preliminary draft to be used to verify algorithm
correctness. I prefer to validate code for the command line.

> Consider throwing an exception instead. Also printf sucks, this is a C++
> newsgroup not a C newsgroup. I cannot be arsed reviewing the rest of
> your algorithm as I generally don't do such things for free (for random
> people at least). :)
>
> /Leigh