Forum: d0p3 BBS

A nice "strlen()" for UTF-8 ...

From Bonita Montero@3:633/10 to All on Wed Nov 19 14:00:22 2025

#include <bit>
#include <concepts>
#include <cstddef>
#include <iostream>
#include <string_view>

using namespace std;

template<bool Validate = false, typename View>
� � requires std::same_as<View, string_view> || std::same_as<View, u8string_view>
size_t utf8Width( View str )
{
� � size_t rem = str.end() - str.begin(), w = 0, chunk;
� � for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
� � {
� � � � chunk = countl_one( (unsigned char)*it );
� � � � chunk += (size_t)!chunk;
� � � � if constexpr( Validate )
� � � � � � if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk ) [[unlikely]]
� � � � � � � � return -1;
� � � � auto end = it + chunk;
� � � � if constexpr( !Validate )
� � � � � � it = end;
� � � � else
� � � � � � while( ++it != end )
� � � � � � � � if( (*it & 0xC0u) != 0x80u )
� � � � � � � � � � return -1;
� � }
� � return w;
}

int main()
{
� � char8_t strU8[] = u8"Hello, ??!";
� � string_view sv( (char *)strU8 );
� � cout << utf8Width<false>( sv ) << endl;
� � cout << utf8Width<true>( sv ) << endl;
� � u8string_view svU8( strU8 );
� � cout << utf8Width<false>( svU8 ) << endl;
� � cout << utf8Width<true>( svU8 ) << endl;
}

--- PyGate Linux v1.5
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Lynn McGuire@3:633/10 to All on Wed Nov 19 12:59:14 2025

On 11/19/2025 7:00 AM, Bonita Montero wrote:

#include <iostream>
#include <string_view>
#include <bit>

using namespace std;

template<bool Validate = false, typename View>
� � requires std::same_as<View, string_view> || std::same_as<View, u8string_view>
size_t utf8Width( View str )
{
� � size_t rem = str.end() - str.begin(), w = 0, chunk;
� � for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
� � {
� � � � chunk = countl_one( (unsigned char)*it );
� � � � chunk += (size_t)!chunk;
� � � � if constexpr( Validate )
� � � � � � if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk ) [[unlikely]]
� � � � � � � � return -1;
� � � � auto end = it + chunk;
� � � � if constexpr( !Validate )
� � � � � � it = end;
� � � � else
� � � � � � while( ++it != end )
� � � � � � � � if( (*it & 0xC0u) != 0x80u )
� � � � � � � � � � return -1;
� � }
� � return w;
}

int main()
{
� � char8_t strU8[] = u8"Hello, ??!";
� � string_view sv( (char *)strU8 );
� � cout << utf8Width<false>( sv ) << endl;
� � cout << utf8Width<true>( sv ) << endl;
� � u8string_view svU8( strU8 );
� � cout << utf8Width<false>( svU8 ) << endl;
� � cout << utf8Width<true>( svU8 ) << endl;
}

Please run it for us !

Lynn

--- PyGate Linux v1.5
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Thu Nov 20 15:25:31 2025

Am 19.11.2025 um 19:59 schrieb Lynn McGuire:

On 11/19/2025 7:00 AM, Bonita Montero wrote:

#include <iostream>
#include <string_view>
#include <bit>

using namespace std;

template<bool Validate = false, typename View>
�� requires std::same_as<View, string_view> || std::same_as<View,
u8string_view>
size_t utf8Width( View str )
{
�� size_t rem = str.end() - str.begin(), w = 0, chunk;
�� for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
�� {
�� chunk = countl_one( (unsigned char)*it );
�� chunk += (size_t)!chunk;
�� if constexpr( Validate )
�� if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk )
[[unlikely]]
�� return -1;
�� auto end = it + chunk;
�� if constexpr( !Validate )
�� it = end;
�� else
�� while( ++it != end )
�� if( (*it & 0xC0u) != 0x80u )
�� return -1;
�� }
�� return w;
}

int main()
{
�� char8_t strU8[] = u8"Hello, ??!";
�� string_view sv( (char *)strU8 );
�� cout << utf8Width<false>( sv ) << endl;
�� cout << utf8Width<true>( sv ) << endl;
�� u8string_view svU8( strU8 );
�� cout << utf8Width<false>( svU8 ) << endl;
�� cout << utf8Width<true>( svU8 ) << endl;
}

Please run it for us !

Lynn

I tried that for AVX-512. If Validate is false that's easy.
But I didn't manage to do that efficiently if Validate is true;

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Lynn McGuire@3:633/10 to All on Fri Nov 21 00:15:30 2025

On 11/19/2025 7:00 AM, Bonita Montero wrote:

#include <iostream>
#include <string_view>
#include <bit>

using namespace std;

template<bool Validate = false, typename View>
� � requires std::same_as<View, string_view> || std::same_as<View, u8string_view>
size_t utf8Width( View str )
{
� � size_t rem = str.end() - str.begin(), w = 0, chunk;
� � for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
� � {
� � � � chunk = countl_one( (unsigned char)*it );
� � � � chunk += (size_t)!chunk;
� � � � if constexpr( Validate )
� � � � � � if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk ) [[unlikely]]
� � � � � � � � return -1;
� � � � auto end = it + chunk;
� � � � if constexpr( !Validate )
� � � � � � it = end;
� � � � else
� � � � � � while( ++it != end )
� � � � � � � � if( (*it & 0xC0u) != 0x80u )
� � � � � � � � � � return -1;
� � }
� � return w;
}

int main()
{
� � char8_t strU8[] = u8"Hello, ??!";
� � string_view sv( (char *)strU8 );
� � cout << utf8Width<false>( sv ) << endl;
� � cout << utf8Width<true>( sv ) << endl;
� � u8string_view svU8( strU8 );
� � cout << utf8Width<false>( svU8 ) << endl;
� � cout << utf8Width<true>( svU8 ) << endl;
}

So, why does strlen care if the text is ASCII or UTF8 ? Strlen is all
about the number of bytes used by the text string.

Now if you need the actual number of characters...

Lynn

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Fri Nov 21 10:42:27 2025

Am 21.11.2025 um 07:15 schrieb Lynn McGuire:

On 11/19/2025 7:00 AM, Bonita Montero wrote:

#include <iostream>
#include <string_view>
#include <bit>

using namespace std;

template<bool Validate = false, typename View>
�� requires std::same_as<View, string_view> || std::same_as<View,
u8string_view>
size_t utf8Width( View str )
{
�� size_t rem = str.end() - str.begin(), w = 0, chunk;
�� for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
�� {
�� chunk = countl_one( (unsigned char)*it );
�� chunk += (size_t)!chunk;
�� if constexpr( Validate )
�� if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk )
[[unlikely]]
�� return -1;
�� auto end = it + chunk;
�� if constexpr( !Validate )
�� it = end;
�� else
�� while( ++it != end )
�� if( (*it & 0xC0u) != 0x80u )
�� return -1;
�� }
�� return w;
}

int main()
{
�� char8_t strU8[] = u8"Hello, ??!";
�� string_view sv( (char *)strU8 );
�� cout << utf8Width<false>( sv ) << endl;
�� cout << utf8Width<true>( sv ) << endl;
�� u8string_view svU8( strU8 );
�� cout << utf8Width<false>( svU8 ) << endl;
�� cout << utf8Width<true>( svU8 ) << endl;
}

So, why does strlen care if the text is ASCII or UTF8? Strlen is all
about the number of bytes used by the text string.
Now if you need the actual number of characters...

This code counts the number of UTF-8 characters.
This happens more often than counting the number of bytes of an
UTF8-string.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Alfred Peters@3:633/10 to All on Sun Nov 23 13:22:21 2025

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.

I have rewritten main() slightly:

[...]

// Prints the given NUL-terminated UTF-8 text followed by its utf8Width()
// results: first through a std::string_view, then through a std::u8string_view,
// each without and with validation.
void testCount(char8_t strU8[])
{
    // Same bytes viewed as plain char so they can be streamed to std::cout.
    const std::string_view sv( reinterpret_cast<const char *>( strU8 ) );
    std::cout << sv << '\n';
    std::cout << utf8Width<false>( sv ) << '\n';
    std::cout << utf8Width<true>( sv ) << '\n';
    const std::u8string_view svU8( strU8 );
    std::cout << utf8Width<false>( svU8 ) << '\n';
    std::cout << utf8Width<true>( svU8 ) << '\n';
}

int main()
{
char8_t strU8[] = u8"Hello, ??!";
testCount( strU8 );

char8_t strU8_1[] = u8"V1: '��'";
testCount(strU8_1);

char8_t strU8_2[] = u8"V2: 'a?o?u?'";
testCount(strU8_2);

char8_t strU8_3[] = u8"'??????????'";
testCount(strU8_3);
}
// End of Code

utf8len.cpp --- Size: 1,5 KB <https://www.dropbox.com/scl/fi/tiy4syytz5d0tvk3fau9g/utf8len.cpp?rlkey=4njtt4m1nvxdyr95pllvka3td&dl=0>

Result:

Screenshot 2025-11-23 124007.png --- Size: 58,4 KB <https://www.dropbox.com/scl/fi/4zccn7262qudthb88gnoi/Screenshot-2025-11-23-124007.png?rlkey=3xu71czkiooujj86mfmlnao15&dl=0>

Unfortunately, the console does not display the last character
correctly. But you can see it correctly on the left side within the editor.

Alfred
--
??? ?? 25894.6
???
???????
?????

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Sun Nov 23 17:35:34 2025

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.

That's your definition.
Each UTF-8 character is also a code point and is built with a varying
number of characters.

I have rewritten main() slightly:

[...]

void testCount(char8_t strU8[])
{
string_view sv( (char *)strU8 );
cout<< sv << endl;
cout << utf8Width<false>( sv ) << endl;
cout << utf8Width<true>( sv ) << endl;
u8string_view svU8( strU8 );
cout << utf8Width<false>( svU8 ) << endl;
cout << utf8Width<true>( svU8 ) << endl;
}

int main()
{
char8_t strU8[] = u8"Hello, ??!";
testCount( strU8 );

char8_t strU8_1[] = u8"V1: '��'";
testCount(strU8_1);

char8_t strU8_2[] = u8"V2: 'a?o?u?'";
testCount(strU8_2);

char8_t strU8_3[] = u8"'??????????'";
testCount(strU8_3);
}
// End of Code

utf8len.cpp --- Size: 1,5 KB <https://www.dropbox.com/scl/fi/tiy4syytz5d0tvk3fau9g/utf8len.cpp?rlkey=4njtt4m1nvxdyr95pllvka3td&dl=0>

Result:

Screenshot 2025-11-23 124007.png --- Size: 58,4 KB <https://www.dropbox.com/scl/fi/4zccn7262qudthb88gnoi/Screenshot-2025-11-23-124007.png?rlkey=3xu71czkiooujj86mfmlnao15&dl=0>

Unfortunately, the console does not display the last character
correctly. But you can see it correctly on the left side within the editor.

Alfred

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From David Brown@3:633/10 to All on Sun Nov 23 18:18:42 2025

On 23/11/2025 17:35, Bonita Montero wrote:

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.

That's your definition.

No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

Each UTF-8 character is also a code point and is built with a varying
number of characters.

Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
one code point. Some Unicode characters can be represented as either a
single code point or a combination (again, accented letters or other diacriticals are common cases here).

Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Mon Nov 24 13:08:30 2025

Am 23.11.2025 um 18:18 schrieb David Brown:

On 23/11/2025 17:35, Bonita Montero wrote:

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.

That's your definition.

No, it is the Unicode definition.� See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

Idiot ...
A code-point is well defined, but a character can be an ASCII-character
and a Unicode code-point.

Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining diacritical marks.� Some Unicode characters can be found at more than
one code point.� Some Unicode characters can be represented as either
a single code point or a combination (again, accented letters or other diacriticals are common cases here).

Everyone understood what I meant. And you're pettifogging now.

Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
characters.� But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

Counting Unicode characters is a very difficult task.� Counting code
points is much simpler and more clearly specified.� Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Chris Vine@3:633/10 to All on Mon Nov 24 12:53:00 2025

On Sun, 23 Nov 2025 18:18:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

On 23/11/2025 17:35, Bonita Montero wrote:

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it
only counts the number of code points.

That's your definition.

No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

Each UTF-8 character is also a code point and is built with a varying number of characters.

Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
one code point. Some Unicode characters can be represented as either a single code point or a combination (again, accented letters or other diacriticals are common cases here).

Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified. Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.

It looks as if the original poster may be confusing in her mind a
unicode character with what the unicode standard refers to as a
"grapheme", and has ended up with a more or less useless function
counting code points.

Julia does in fact provide a function which counts graphemes
( https://riptutorial.com/julia-lang/example/20449/graphemes , which
makes a reasonable job of explaining the problem), and python has a
grapheme package available also, and probably some other languages do
too. However even counting graphemes and/or grapheme clusters doesn't necessarily help you with calculating sizes for screen display, and the
reward of doing this, such as it is, seems barely worth the effort (and potential inefficiency).

Chris

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From David Brown@3:633/10 to All on Mon Nov 24 14:09:01 2025

On 24/11/2025 13:08, Bonita Montero wrote:

Am 23.11.2025 um 18:18 schrieb David Brown:

On 23/11/2025 17:35, Bonita Montero wrote:

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already mentioned, it only counts the number of code points.

That's your definition.

No, it is the Unicode definition.� See
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

Idiot ...

Speak for yourself.

A code-point is well defined, but a character can be an ASCII-character
and a Unicode code-point.

Yes, Unicode code-points are well-defined. The term "character" is not
nearly as clear. But what /is/ clear is that "code point" and
"character" do not mean the same thing.

Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining
diacritical marks.� Some Unicode characters can be found at more than
one code point.� Some Unicode characters can be represented as either
a single code point or a combination (again, accented letters or other
diacriticals are common cases here).

Everyone understood what I meant. And youre pettifoging now.

I think it was clear that you meant "code points" when you wrote
"characters". Fair enough. Counting Unicode characters is a much more complicated matter, and not what you were trying to do.

But you jumped at Alfred when he correctly pointed out that your code
counts code points, not characters. I am merely informing you that this
is not Alfred's definition - it is the Unicode definition. The /real/ definition in this context.

It's easy enough to make the mistake and write "character" when you mean
"code point". But when someone points out the mistake, accept that
correction rather than shooting the messenger.

Trying to pin down what a Unicode "character" is turns out to be very
difficult, and does not match what people usually think of as
characters.� But one thing that we can be sure about, is that Unicode
characters and Unicode code points are not synonymous.

Counting Unicode characters is a very difficult task.� Counting code
points is much simpler and more clearly specified.� Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and they
don't tell you much about the size taken on screens or when printed out.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Mon Nov 24 14:31:05 2025

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't really characters in the literal
sense either, but that's what they're called. You're completely nuts.

Am 24.11.2025 um 14:09 schrieb David Brown:

On 24/11/2025 13:08, Bonita Montero wrote:

Am 23.11.2025 um 18:18 schrieb David Brown:

On 23/11/2025 17:35, Bonita Montero wrote:

Am 23.11.2025 um 13:22 schrieb Alfred Peters:

Es schrieb einmal Bonita Montero:

This code counts the number of UTF-8 characters.

Unfortunately, no. As someone in the other group already
mentioned, it
only counts the number of code points.

That's your definition.

No, it is the Unicode definition. See
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

Idiot ...

Speak for yourself.

A code-point is well defined, but a character can be an
ASCII-character and a Unicode code-point.

Yes, Unicode code-points are well-defined.� The term "character" is
not nearly as clear.� But what /is/ clear is that "code point" and "character" do not mean the same thing.

Some code points do not represent characters - they can be
noncharacters, reserved, surrogates, etc. Some Unicode characters
require more than one code point, such as those that need combining
diacritical marks.� Some Unicode characters can be found at more
than one code point. Some Unicode characters can be represented as
either a single code point or a combination (again, accented letters
or other diacriticals are common cases here).

Everyone understood what I meant. And youre pettifoging now.

I think it was clear that you meant "code points" when you wrote "characters".� Fair enough.� Counting Unicode characters is a much
more complicated matter, and not what you were trying to do.

But you jumped at Alfred when he correctly pointed out that your code
counts code points, not characters.� I am merely informing you that
this is not Alfred's definition - it is the Unicode definition.� The
/real/ definition in this context.

It's easy enough to make the mistake and write "character" when you
mean "code point".� But when someone points out the mistake, accept
that correction rather than shooting the messenger.

Trying to pin down what a Unicode "character" is turns out to be
very difficult, and does not match what people usually think of as
characters.� But one thing that we can be sure about, is that
Unicode characters and Unicode code points are not synonymous.

Counting Unicode characters is a very difficult task. Counting code
points is much simpler and more clearly specified.� Neither task is
normally particularly useful - they don't tell you much about space
usage that can be helpful for things like memory allocation, and
they don't tell you much about the size taken on screens or when
printed out.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From David Brown@3:633/10 to All on Mon Nov 24 15:45:42 2025

On 24/11/2025 14:31, Bonita Montero wrote:

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't really characters in the literal
sense either, but that's what they're called.

That's /your/ definition.

Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead of picking an inaccurate name for it. What is your difficulty in using the simple, clear and correct term "code point" instead of a vague and
poorly defined alternative?

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Mon Nov 24 16:32:00 2025

Am 24.11.2025 um 15:45 schrieb David Brown:

On 24/11/2025 14:31, Bonita Montero wrote:

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't really
characters in the literal
sense either, but that's what they're called.

That's /your/ definition.

There's no definition at all with that.
You're utterly compulsive.

Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context.� You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead of picking an inaccurate name for it. What is your difficulty in using
the simple, clear and correct term "code point" instead of a vague and poorly defined alternative?

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Michael S@3:633/10 to All on Mon Nov 24 19:16:04 2025

On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

On 24/11/2025 14:31, Bonita Montero wrote:

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.

That's /your/ definition.

Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead
of picking an inaccurate name for it. What is your difficulty in
using the simple, clear and correct term "code point" instead of a
vague and poorly defined alternative?

Do you happen to know what is represented by values of type 'char' in
Rust? :-)

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From David Brown@3:633/10 to All on Mon Nov 24 18:32:13 2025

On 24/11/2025 18:16, Michael S wrote:

On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

On 24/11/2025 14:31, Bonita Montero wrote:

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.

That's /your/ definition.

Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context. You
made a function for counting Unicode code points from a UTF8 string.
That's great - it's a useful thing to do. Just call it that instead
of picking an inaccurate name for it. What is your difficulty in
using the simple, clear and correct term "code point" instead of a
vague and poorly defined alternative?

Do you happen to know what is represented by values of type 'char' in
Rust? :-)

No, sorry.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Michael S@3:633/10 to All on Mon Nov 24 19:51:47 2025

On Mon, 24 Nov 2025 18:32:13 +0100
David Brown <david.brown@hesbynett.no> wrote:

On 24/11/2025 18:16, Michael S wrote:

On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

On 24/11/2025 14:31, Bonita Montero wrote:

I don't understand your problem. You seem extremely obsessive. A
character can be anything,
a Unicode code point or an ASCII character. The latter aren't
really characters in the literal
sense either, but that's what they're called.

That's /your/ definition.

Use your own private definition if you want, but don't deride other
people for using correct definitions appropriate to the context.
You made a function for counting Unicode code points from a UTF8
string. That's great - it's a useful thing to do. Just call it
that instead of picking an inaccurate name for it. What is your
difficulty in using the simple, clear and correct term "code
point" instead of a vague and poorly defined alternative?

Do you happen to know what is represented by values of type 'char'
in Rust? :-)

No, sorry.

I thought that you can guess that it represents Unicode code point.
Of course, "The Book" pays lip service: "However, a 'character' isn't
really a concept in Unicode, so your human intuition for what a
'character' is may not match up with what a char is in Rust."
But who pays attention to lip service?
You can be reasonably sure that 99.99% of Rust programmers use the word 'character' in the meaning 'Unicode code point'.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Michael S@3:633/10 to All on Mon Nov 24 19:54:55 2025

On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

it's a useful thing to do.

Is it?

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From David Brown@3:633/10 to All on Mon Nov 24 21:14:30 2025

On 24/11/2025 18:54, Michael S wrote:

On Mon, 24 Nov 2025 15:45:42 +0100
David Brown <david.brown@hesbynett.no> wrote:

it's a useful thing to do.

Is it?

I don't think it is a /very/ useful thing, compared to things like
finding the length of the UTF-8 string in bytes, or the space needed to display the string. But I'm fairly sure people do use such functions.

--- PyGate Linux v1.5.1
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

Who's Online
Recent Visitors
- RufusT
  Tue Feb 17 12:05:47 2026
  from Dallas, TX via RLogin
- RufusT
  Wed Feb 18 01:50:00 2026
  from Dallas, TX via RLogin
- RufusT
  Wed Feb 25 05:02:02 2026
  from Dallas, TX via RLogin
- The God Farther
  Mon Mar 9 13:05:07 2026
  from Lake Ozark, Missouri. via Telnet

System Info

Sysop:	Tetrazocine
Location:	Melbourne, VIC, Australia
Users:	15
Nodes:	8 (0 / 8)
Uptime:	71:14:15
Calls:	208
Calls today:	1
Files:	21,502
Messages:	81,117

A nice "strlen()" for UTF-8 ...

Who's Online

Recent Visitors

System Info