#include <iostream>
#include <string_view>
#include <bit>
using namespace std;
/// Counts the Unicode code points encoded in a UTF-8 string.
///
/// Note that this counts code points, not user-perceived characters
/// (grapheme clusters): a base letter followed by a combining mark counts as two.
///
/// @tparam Validate  When true, the structure of every sequence is checked:
///                   no stray continuation byte in lead position, no sequence
///                   longer than the 4 bytes RFC 3629 allows, no truncated
///                   sequence, and every trailing byte must be a continuation
///                   byte (10xxxxxx). Overlong forms, surrogates and values
///                   above U+10FFFF are NOT rejected.
///                   When false, the input is trusted; malformed input yields
///                   an unspecified count but never reads outside the view.
/// @tparam View      std::string_view or std::u8string_view.
/// @param  str       The bytes to scan; embedded NULs count as one code point each.
/// @return The number of code points, or static_cast<std::size_t>(-1) if
///         Validate is true and the input is malformed.
template<bool Validate = false, typename View>
requires std::same_as<View, std::string_view> || std::same_as<View, std::u8string_view>
std::size_t utf8Width( View str )
{
    constexpr std::size_t invalid = static_cast<std::size_t>( -1 );
    std::size_t rem = str.size(), w = 0;
    for( auto it = str.begin(); rem != 0; ++w )
    {
        auto const lead = static_cast<unsigned char>( *it );
        // The number of leading one bits in the lead byte is the sequence
        // length; zero leading ones means a single-byte (ASCII) code point.
        std::size_t chunk = static_cast<std::size_t>( std::countl_one( lead ) );
        chunk += static_cast<std::size_t>( !chunk );
        if constexpr( Validate )
        {
            // 10xxxxxx in lead position is a stray continuation byte; UTF-8
            // sequences are at most 4 bytes long; the sequence must fit.
            if( (lead & 0xC0u) == 0x80u || chunk > 4 || rem < chunk ) [[unlikely]]
                return invalid;
            for( std::size_t i = 1; i != chunk; ++i )
                if( (static_cast<unsigned char>( it[static_cast<typename View::difference_type>( i )] ) & 0xC0u) != 0x80u ) [[unlikely]]
                    return invalid;
        }
        else
        {
            // Trusting mode: still never step past the end, otherwise a
            // truncated tail would wrap `rem` and read out of bounds.
            if( chunk > rem )
                chunk = rem;
        }
        it += static_cast<typename View::difference_type>( chunk );
        rem -= chunk;
    }
    return w;
}
int main()
{
char8_t strU8[] = u8"Hello, ??!";
string_view sv( (char *)strU8 );
cout << utf8Width<false>( sv ) << endl;
cout << utf8Width<true>( sv ) << endl;
u8string_view svU8( strU8 );
cout << utf8Width<false>( svU8 ) << endl;
cout << utf8Width<true>( svU8 ) << endl;
}
On 11/19/2025 7:00 AM, Bonita Montero wrote:
#include <iostream>
#include <string_view>
#include <bit>
using namespace std;
/// Counts the Unicode code points encoded in a UTF-8 string.
///
/// Note that this counts code points, not user-perceived characters
/// (grapheme clusters): a base letter followed by a combining mark counts as two.
///
/// @tparam Validate  When true, the structure of every sequence is checked:
///                   no stray continuation byte in lead position, no sequence
///                   longer than the 4 bytes RFC 3629 allows, no truncated
///                   sequence, and every trailing byte must be a continuation
///                   byte (10xxxxxx). Overlong forms, surrogates and values
///                   above U+10FFFF are NOT rejected.
///                   When false, the input is trusted; malformed input yields
///                   an unspecified count but never reads outside the view.
/// @tparam View      std::string_view or std::u8string_view.
/// @param  str       The bytes to scan; embedded NULs count as one code point each.
/// @return The number of code points, or static_cast<std::size_t>(-1) if
///         Validate is true and the input is malformed.
template<bool Validate = false, typename View>
requires std::same_as<View, std::string_view> || std::same_as<View, std::u8string_view>
std::size_t utf8Width( View str )
{
    constexpr std::size_t invalid = static_cast<std::size_t>( -1 );
    std::size_t rem = str.size(), w = 0;
    for( auto it = str.begin(); rem != 0; ++w )
    {
        auto const lead = static_cast<unsigned char>( *it );
        // The number of leading one bits in the lead byte is the sequence
        // length; zero leading ones means a single-byte (ASCII) code point.
        std::size_t chunk = static_cast<std::size_t>( std::countl_one( lead ) );
        chunk += static_cast<std::size_t>( !chunk );
        if constexpr( Validate )
        {
            // 10xxxxxx in lead position is a stray continuation byte; UTF-8
            // sequences are at most 4 bytes long; the sequence must fit.
            if( (lead & 0xC0u) == 0x80u || chunk > 4 || rem < chunk ) [[unlikely]]
                return invalid;
            for( std::size_t i = 1; i != chunk; ++i )
                if( (static_cast<unsigned char>( it[static_cast<typename View::difference_type>( i )] ) & 0xC0u) != 0x80u ) [[unlikely]]
                    return invalid;
        }
        else
        {
            // Trusting mode: still never step past the end, otherwise a
            // truncated tail would wrap `rem` and read out of bounds.
            if( chunk > rem )
                chunk = rem;
        }
        it += static_cast<typename View::difference_type>( chunk );
        rem -= chunk;
    }
    return w;
}
int main()
{
char8_t strU8[] = u8"Hello, ??!";
string_view sv( (char *)strU8 );
cout << utf8Width<false>( sv ) << endl;
cout << utf8Width<true>( sv ) << endl;
u8string_view svU8( strU8 );
cout << utf8Width<false>( svU8 ) << endl;
cout << utf8Width<true>( svU8 ) << endl;
}
Please run it for us!
Lynn
#include <iostream>
#include <string_view>
#include <bit>
using namespace std;
/// Counts the Unicode code points encoded in a UTF-8 string.
///
/// Note that this counts code points, not user-perceived characters
/// (grapheme clusters): a base letter followed by a combining mark counts as two.
///
/// @tparam Validate  When true, the structure of every sequence is checked:
///                   no stray continuation byte in lead position, no sequence
///                   longer than the 4 bytes RFC 3629 allows, no truncated
///                   sequence, and every trailing byte must be a continuation
///                   byte (10xxxxxx). Overlong forms, surrogates and values
///                   above U+10FFFF are NOT rejected.
///                   When false, the input is trusted; malformed input yields
///                   an unspecified count but never reads outside the view.
/// @tparam View      std::string_view or std::u8string_view.
/// @param  str       The bytes to scan; embedded NULs count as one code point each.
/// @return The number of code points, or static_cast<std::size_t>(-1) if
///         Validate is true and the input is malformed.
template<bool Validate = false, typename View>
requires std::same_as<View, std::string_view> || std::same_as<View, std::u8string_view>
std::size_t utf8Width( View str )
{
    constexpr std::size_t invalid = static_cast<std::size_t>( -1 );
    std::size_t rem = str.size(), w = 0;
    for( auto it = str.begin(); rem != 0; ++w )
    {
        auto const lead = static_cast<unsigned char>( *it );
        // The number of leading one bits in the lead byte is the sequence
        // length; zero leading ones means a single-byte (ASCII) code point.
        std::size_t chunk = static_cast<std::size_t>( std::countl_one( lead ) );
        chunk += static_cast<std::size_t>( !chunk );
        if constexpr( Validate )
        {
            // 10xxxxxx in lead position is a stray continuation byte; UTF-8
            // sequences are at most 4 bytes long; the sequence must fit.
            if( (lead & 0xC0u) == 0x80u || chunk > 4 || rem < chunk ) [[unlikely]]
                return invalid;
            for( std::size_t i = 1; i != chunk; ++i )
                if( (static_cast<unsigned char>( it[static_cast<typename View::difference_type>( i )] ) & 0xC0u) != 0x80u ) [[unlikely]]
                    return invalid;
        }
        else
        {
            // Trusting mode: still never step past the end, otherwise a
            // truncated tail would wrap `rem` and read out of bounds.
            if( chunk > rem )
                chunk = rem;
        }
        it += static_cast<typename View::difference_type>( chunk );
        rem -= chunk;
    }
    return w;
}
int main()
{
char8_t strU8[] = u8"Hello, ??!";
string_view sv( (char *)strU8 );
cout << utf8Width<false>( sv ) << endl;
cout << utf8Width<true>( sv ) << endl;
u8string_view svU8( strU8 );
cout << utf8Width<false>( svU8 ) << endl;
cout << utf8Width<true>( svU8 ) << endl;
}
On 11/19/2025 7:00 AM, Bonita Montero wrote:
This code counts the number of UTF-8 characters.
#include <iostream>
#include <string_view>
#include <bit>
using namespace std;
/// Counts the Unicode code points encoded in a UTF-8 string.
///
/// Note that this counts code points, not user-perceived characters
/// (grapheme clusters): a base letter followed by a combining mark counts as two.
///
/// @tparam Validate  When true, the structure of every sequence is checked:
///                   no stray continuation byte in lead position, no sequence
///                   longer than the 4 bytes RFC 3629 allows, no truncated
///                   sequence, and every trailing byte must be a continuation
///                   byte (10xxxxxx). Overlong forms, surrogates and values
///                   above U+10FFFF are NOT rejected.
///                   When false, the input is trusted; malformed input yields
///                   an unspecified count but never reads outside the view.
/// @tparam View      std::string_view or std::u8string_view.
/// @param  str       The bytes to scan; embedded NULs count as one code point each.
/// @return The number of code points, or static_cast<std::size_t>(-1) if
///         Validate is true and the input is malformed.
template<bool Validate = false, typename View>
requires std::same_as<View, std::string_view> || std::same_as<View, std::u8string_view>
std::size_t utf8Width( View str )
{
    constexpr std::size_t invalid = static_cast<std::size_t>( -1 );
    std::size_t rem = str.size(), w = 0;
    for( auto it = str.begin(); rem != 0; ++w )
    {
        auto const lead = static_cast<unsigned char>( *it );
        // The number of leading one bits in the lead byte is the sequence
        // length; zero leading ones means a single-byte (ASCII) code point.
        std::size_t chunk = static_cast<std::size_t>( std::countl_one( lead ) );
        chunk += static_cast<std::size_t>( !chunk );
        if constexpr( Validate )
        {
            // 10xxxxxx in lead position is a stray continuation byte; UTF-8
            // sequences are at most 4 bytes long; the sequence must fit.
            if( (lead & 0xC0u) == 0x80u || chunk > 4 || rem < chunk ) [[unlikely]]
                return invalid;
            for( std::size_t i = 1; i != chunk; ++i )
                if( (static_cast<unsigned char>( it[static_cast<typename View::difference_type>( i )] ) & 0xC0u) != 0x80u ) [[unlikely]]
                    return invalid;
        }
        else
        {
            // Trusting mode: still never step past the end, otherwise a
            // truncated tail would wrap `rem` and read out of bounds.
            if( chunk > rem )
                chunk = rem;
        }
        it += static_cast<typename View::difference_type>( chunk );
        rem -= chunk;
    }
    return w;
}
int main()
{
char8_t strU8[] = u8"Hello, ??!";
string_view sv( (char *)strU8 );
cout << utf8Width<false>( sv ) << endl;
cout << utf8Width<true>( sv ) << endl;
u8string_view svU8( strU8 );
cout << utf8Width<false>( svU8 ) << endl;
cout << utf8Width<true>( svU8 ) << endl;
}
So, why does strlen care whether the text is ASCII or UTF-8? strlen is all
about the number of bytes used by the text string.
Now, if you need the actual number of characters...
This code counts the number of UTF-8 characters.
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
I have rewritten main() slightly:
[...]
// Prints the given NUL-terminated UTF-8 text followed by four code-point
// counts: non-validating and validating, first through a string_view and then
// through a u8string_view over the same buffer.
void testCount(char8_t strU8[])
{
    // Both views alias the caller's buffer; only the element type differs.
    std::string_view const narrow( reinterpret_cast<char *>( strU8 ) );
    std::u8string_view const utf8( strU8 );
    std::cout << narrow << std::endl;
    std::cout << utf8Width<false>( narrow ) << std::endl;
    std::cout << utf8Width<true>( narrow ) << std::endl;
    std::cout << utf8Width<false>( utf8 ) << std::endl;
    std::cout << utf8Width<true>( utf8 ) << std::endl;
}
int main()
{
char8_t strU8[] = u8"Hello, ??!";
testCount( strU8 );
char8_t strU8_1[] = u8"V1: ''";
testCount(strU8_1);
char8_t strU8_2[] = u8"V2: 'a?o?u?'";
testCount(strU8_2);
char8_t strU8_3[] = u8"'??????????'";
testCount(strU8_3);
}
// End of Code
utf8len.cpp --- Size: 1,5 KB <https://www.dropbox.com/scl/fi/tiy4syytz5d0tvk3fau9g/utf8len.cpp?rlkey=4njtt4m1nvxdyr95pllvka3td&dl=0>
Result:
Screenshot 2025-11-23 124007.png --- Size: 58,4 KB <https://www.dropbox.com/scl/fi/4zccn7262qudthb88gnoi/Screenshot-2025-11-23-124007.png?rlkey=3xu71czkiooujj86mfmlnao15&dl=0>
Unfortunately, the console does not display the last character
correctly. But you can see it correctly on the left side within the editor.
Alfred
Am 23.11.2025 um 13:22 schrieb Alfred Peters:
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
Each UTF-8 character is also a code point and is built with a varying
number of characters.
Some code points do not represent characters - they can be
On 23/11/2025 17:35, Bonita Montero wrote:
Am 23.11.2025 um 13:22 schrieb Alfred Peters:
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>
Everyone understood what I meant. And you're pettifogging now.
Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
one code point. Some Unicode characters can be represented as either
a single code point or a combination (again, accented letters or other diacriticals are common cases here).
Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.
Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.
On 23/11/2025 17:35, Bonita Montero wrote:
Am 23.11.2025 um 13:22 schrieb Alfred Peters:
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>
Each UTF-8 character is also a code point and is built with a varying number of characters.
Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
one code point. Some Unicode characters can be represented as either a single code point or a combination (again, accented letters or other diacriticals are common cases here).
Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.
Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.
Am 23.11.2025 um 18:18 schrieb David Brown:
On 23/11/2025 17:35, Bonita Montero wrote:
Am 23.11.2025 um 13:22 schrieb Alfred Peters:
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
No, it is the Unicode definition. See
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>
Idiot ...
A code-point is well defined, but a character can be an ASCII-character
and a Unicode code-point.
Everyone understood what I meant. And you're pettifogging now.
Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining
diacritical marks. Some Unicode characters can be found at more than
one code point. Some Unicode characters can be represented as either
a single code point or a combination (again, accented letters or other
diacriticals are common cases here).
Trying to pin down what a Unicode "character" is turns out to be very
difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that Unicode
characters and Unicode code points are not synonymous.
Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.
On 24/11/2025 13:08, Bonita Montero wrote:
Am 23.11.2025 um 18:18 schrieb David Brown:
On 23/11/2025 17:35, Bonita Montero wrote:
Idiot ...
Am 23.11.2025 um 13:22 schrieb Alfred Peters:
Es schrieb einmal Bonita Montero:
That's your definition.
This code counts the number of UTF-8 characters.
Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.
No, it is the Unicode definition. See
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>
Speak for yourself.
A code-point is well defined, but a character can be an
ASCII-character and a Unicode code-point.
Yes, Unicode code-points are well-defined. The term "character" is
not nearly as clear. But what /is/ clear is that "code point" and "character" do not mean the same thing.
Everyone understood what I meant. And you're pettifogging now.
Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining
diacritical marks. Some Unicode characters can be found at more
than one code point. Some Unicode characters can be represented as
either a single code point or a combination (again, accented letters
or other diacriticals are common cases here).
I think it was clear that you meant "code points" when you wrote "characters". Fair enough. Counting Unicode characters is a much
more complicated matter, and not what you were trying to do.
But you jumped at Alfred when he correctly pointed out that your code
counts code points, not characters. I am merely informing you that
this is not Alfred's definition - it is the Unicode definition. The
/real/ definition in this context.
It's easy enough to make the mistake and write "character" when you
mean "code point". But when someone points out the mistake, accept
that correction rather than shooting the messenger.
Trying to pin down what a Unicode "character" is turns out to be
very difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that
Unicode characters and Unicode code points are not synonymous.
Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and
they don't tell you much about the size taken on screens or when
printed out.
I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't really characters in the literal
sense either, but that's what they're called.
On 24/11/2025 14:31, Bonita Montero wrote:
There's no definition at all with that.
I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't really
characters in the literal
sense either, but that's what they're called.
That's /your/ definition.
Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead of picking an inaccurate name for it. What is your difficulty in using
the simple, clear and correct term "code point" instead of a vague and poorly defined alternative?
On 24/11/2025 14:31, Bonita Montero wrote:
I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.
That's /your/ definition.
Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead
of picking an inaccurate name for it. What is your difficulty in
using the simple, clear and correct term "code point" instead of a
vague and poorly defined alternative?
On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:
On 24/11/2025 14:31, Bonita Montero wrote:
I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.
That's /your/ definition.
Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead
of picking an inaccurate name for it. What is your difficulty in
using the simple, clear and correct term "code point" instead of a
vague and poorly defined alternative?
Do you happen to know what is represented by values of type 'char' in
Rust? :-)
On 24/11/2025 18:16, Michael S wrote:
On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:
On 24/11/2025 14:31, Bonita Montero wrote:
I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.
That's /your/ definition.
Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context.
You made a function for counting Unicode code points from a UTF8
string. That's great - it's a useful thing to do. Just call it
that instead of picking an inaccurate name for it. What is your
difficulty in using the simple, clear and correct term "code
point" instead of a vague and poorly defined alternative?
Do you happen to know what is represented by values of type 'char'
in Rust? :-)
No, sorry.
it's a useful thing to do.
On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:
it's a useful thing to do.
Is it?
| Sysop: | Tetrazocine |
|---|---|
| Location: | Melbourne, VIC, Australia |
| Users: | 14 |
| Nodes: | 8 (0 / 8) |
| Uptime: | 09:26:39 |
| Calls: | 184 |
| Files: | 21,502 |
| Messages: | 79,221 |