• A nice "strlen()" for UTF-8 ...

    From Bonita Montero@3:633/10 to All on Wed Nov 19 14:00:22 2025
    #include <bit>
    #include <concepts>
    #include <cstddef>
    #include <iostream>
    #include <string_view>

    using namespace std;

    // Counts the Unicode code points encoded in a UTF-8 string view.
    //
    // Note: the result is a number of code points, not of user-perceived
    // characters (grapheme clusters) and not a display width.
    //
    // Template parameters:
    //   Validate - when false (default) the input is trusted to be well-formed
    //              and only lead bytes are inspected; when true every sequence
    //              is fully checked.
    //   View     - std::string_view or std::u8string_view.
    //
    // Returns the number of code points. With Validate == true, returns
    // static_cast<std::size_t>( -1 ) if the input is not well-formed UTF-8:
    // stray continuation byte, sequence longer than four bytes, truncated
    // sequence, bad continuation byte, overlong encoding, UTF-16 surrogate,
    // or a value above U+10FFFF.
    //
    // With Validate == false, malformed input never causes reads outside the
    // view; a truncated trailing sequence is counted as one code point.
    template<bool Validate = false, typename View>
    requires std::same_as<View, std::string_view> || std::same_as<View, std::u8string_view>
    std::size_t utf8Width( View str )
    {
        using diff_t = typename View::difference_type;
        constexpr std::size_t invalid = static_cast<std::size_t>( -1 );

        std::size_t rem = str.size(), w = 0;
        for( auto it = str.begin(); rem != 0; ++w )
        {
            unsigned char const lead = static_cast<unsigned char>( *it );
            // The number of leading one bits of the lead byte is the sequence
            // length; ASCII bytes have none, so they are mapped to length 1.
            std::size_t chunk = static_cast<std::size_t>( std::countl_one( lead ) );
            chunk += static_cast<std::size_t>( !chunk );

            if constexpr( !Validate )
            {
                // Clamp so a truncated final sequence cannot wrap `rem` around
                // or move the iterator past the end of the view.
                if( chunk > rem )
                    chunk = rem;
                it += static_cast<diff_t>( chunk );
            }
            else
            {
                // A continuation byte (10xxxxxx) cannot start a sequence, no
                // sequence is longer than four bytes, and the whole sequence
                // has to fit into the remaining input.
                if( (lead & 0xC0u) == 0x80u || chunk > 4 || chunk > rem ) [[unlikely]]
                    return invalid;

                // Payload bits of the lead byte: 7 for ASCII, otherwise what
                // is left after the length prefix and its terminating 0 bit.
                char32_t cp = lead;
                if( chunk > 1 )
                    cp &= 0xFFu >> (chunk + 1);

                auto const end = it + static_cast<diff_t>( chunk );
                while( ++it != end )
                {
                    unsigned char const cont = static_cast<unsigned char>( *it );
                    if( (cont & 0xC0u) != 0x80u ) [[unlikely]]
                        return invalid;
                    cp = (cp << 6) | (cont & 0x3Fu);
                }

                // Smallest code point that legitimately needs `chunk` bytes;
                // anything below is an overlong encoding.
                constexpr char32_t minCp[] = { 0, 0, 0x80, 0x800, 0x10000 };
                if( cp < minCp[chunk] || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF) ) [[unlikely]]
                    return invalid;
            }
            rem -= chunk;
        }
        return w;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    string_view sv( (char *)strU8 );
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }


    --- PyGate Linux v1.5
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lynn McGuire@3:633/10 to All on Wed Nov 19 12:59:14 2025
    On 11/19/2025 7:00 AM, Bonita Montero wrote:
    #include <iostream>
    #include <string_view>
    #include <bit>

    using namespace std;

    template<bool Validate = false, typename View>
    requires std::same_as<View, string_view> || std::same_as<View, u8string_view>
    size_t utf8Width( View str )
    {
    size_t rem = str.end() - str.begin(), w = 0, chunk;
    for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
    {
    chunk = countl_one( (unsigned char)*it );
    chunk += (size_t)!chunk;
    if constexpr( Validate )
    if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk ) [[unlikely]]
    return -1;
    auto end = it + chunk;
    if constexpr( !Validate )
    it = end;
    else
    while( ++it != end )
    if( (*it & 0xC0u) != 0x80u )
    return -1;
    }
    return w;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    string_view sv( (char *)strU8 );
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }

    Please run it for us !

    Lynn


    --- PyGate Linux v1.5
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Thu Nov 20 15:25:31 2025
    Am 19.11.2025 um 19:59 schrieb Lynn McGuire:
    On 11/19/2025 7:00 AM, Bonita Montero wrote:
    #include <iostream>
    #include <string_view>
    #include <bit>

    using namespace std;

    template<bool Validate = false, typename View>
    requires std::same_as<View, string_view> || std::same_as<View,
    u8string_view>
    size_t utf8Width( View str )
    {
    size_t rem = str.end() - str.begin(), w = 0, chunk;
    for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
    {
    chunk = countl_one( (unsigned char)*it );
    chunk += (size_t)!chunk;
    if constexpr( Validate )
    if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk )
    [[unlikely]]
    return -1;
    auto end = it + chunk;
    if constexpr( !Validate )
    it = end;
    else
    while( ++it != end )
    if( (*it & 0xC0u) != 0x80u )
    return -1;
    }
    return w;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    string_view sv( (char *)strU8 );
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }

    Please run it for us !

    Lynn

    I tried that for AVX-512. If Validate is false that's easy.
    But I didn't manage to do that efficiently if Validate is true;


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lynn McGuire@3:633/10 to All on Fri Nov 21 00:15:30 2025
    On 11/19/2025 7:00 AM, Bonita Montero wrote:
    #include <iostream>
    #include <string_view>
    #include <bit>

    using namespace std;

    template<bool Validate = false, typename View>
    requires std::same_as<View, string_view> || std::same_as<View, u8string_view>
    size_t utf8Width( View str )
    {
    size_t rem = str.end() - str.begin(), w = 0, chunk;
    for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
    {
    chunk = countl_one( (unsigned char)*it );
    chunk += (size_t)!chunk;
    if constexpr( Validate )
    if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk ) [[unlikely]]
    return -1;
    auto end = it + chunk;
    if constexpr( !Validate )
    it = end;
    else
    while( ++it != end )
    if( (*it & 0xC0u) != 0x80u )
    return -1;
    }
    return w;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    string_view sv( (char *)strU8 );
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }

    So, why does strlen care if the text is ASCII or UTF8 ? Strlen is all
    about the number of bytes used by the text string.

    Now if you need the actual number of characters...

    Lynn


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Fri Nov 21 10:42:27 2025
    Am 21.11.2025 um 07:15 schrieb Lynn McGuire:
    On 11/19/2025 7:00 AM, Bonita Montero wrote:
    #include <iostream>
    #include <string_view>
    #include <bit>

    using namespace std;

    template<bool Validate = false, typename View>
    requires std::same_as<View, string_view> || std::same_as<View,
    u8string_view>
    size_t utf8Width( View str )
    {
    size_t rem = str.end() - str.begin(), w = 0, chunk;
    for( auto it = str.begin(); rem; rem -= chunk, ++w ) [[likely]]
    {
    chunk = countl_one( (unsigned char)*it );
    chunk += (size_t)!chunk;
    if constexpr( Validate )
    if( (*it & 0xC0u) == 0x80u || chunk > 5 || rem < chunk )
    [[unlikely]]
    return -1;
    auto end = it + chunk;
    if constexpr( !Validate )
    it = end;
    else
    while( ++it != end )
    if( (*it & 0xC0u) != 0x80u )
    return -1;
    }
    return w;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    string_view sv( (char *)strU8 );
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }

    So, why does strlen care if the text is ASCII or UTF8 ? Strlen is all
    about the number of bytes used by the text string.
    Now if you need the actual number of characters...
    This code counts the number of UTF-8 characters.
    This is needed more often than counting the number of bytes of a
    UTF-8 string.

    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Alfred Peters@3:633/10 to All on Sun Nov 23 13:22:21 2025
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.

    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.

    I have rewritten main() slightly:

    [...]

    // Prints the given NUL-terminated char8_t string, followed by the
    // utf8Width result for it: first via a plain-char view, then via a
    // char8_t view, each time without and with validation.
    void testCount(char8_t strU8[])
    {
        // The same bytes reinterpreted as plain char so they can be streamed.
        string_view const narrowView( reinterpret_cast<char *>( strU8 ) );
        cout << narrowView << endl;
        cout << utf8Width<false>( narrowView ) << endl;
        cout << utf8Width<true>( narrowView ) << endl;

        u8string_view const utf8View( strU8 );
        cout << utf8Width<false>( utf8View ) << endl;
        cout << utf8Width<true>( utf8View ) << endl;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    testCount( strU8 );

    char8_t strU8_1[] = u8"V1: ''";
    testCount(strU8_1);

    char8_t strU8_2[] = u8"V2: 'a?o?u?'";
    testCount(strU8_2);

    char8_t strU8_3[] = u8"'??????????'";
    testCount(strU8_3);
    }
    // End of Code

    utf8len.cpp --- Size: 1,5 KB <https://www.dropbox.com/scl/fi/tiy4syytz5d0tvk3fau9g/utf8len.cpp?rlkey=4njtt4m1nvxdyr95pllvka3td&dl=0>

    Result:

    Screenshot 2025-11-23 124007.png --- Size: 58,4 KB <https://www.dropbox.com/scl/fi/4zccn7262qudthb88gnoi/Screenshot-2025-11-23-124007.png?rlkey=3xu71czkiooujj86mfmlnao15&dl=0>

    Unfortunately, the console does not display the last character
    correctly. But you can see it correctly on the left side within the editor.

    Alfred
    --
    ??? ?? 25894.6
    ???
    ???????
    ?????

    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Sun Nov 23 17:35:34 2025
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.
    That's your definition.
    Each UTF-8 character is also a code point and is built with a varying
    number of characters.

    I have rewritten main() slightly:

    [...]

    void testCount(char8_t strU8[])
    {
    string_view sv( (char *)strU8 );
    cout<< sv << endl;
    cout << utf8Width<false>( sv ) << endl;
    cout << utf8Width<true>( sv ) << endl;
    u8string_view svU8( strU8 );
    cout << utf8Width<false>( svU8 ) << endl;
    cout << utf8Width<true>( svU8 ) << endl;
    }

    int main()
    {
    char8_t strU8[] = u8"Hello, ??!";
    testCount( strU8 );

    char8_t strU8_1[] = u8"V1: ''";
    testCount(strU8_1);

    char8_t strU8_2[] = u8"V2: 'a?o?u?'";
    testCount(strU8_2);

    char8_t strU8_3[] = u8"'??????????'";
    testCount(strU8_3);
    }
    // End of Code

    utf8len.cpp --- Size: 1,5 KB <https://www.dropbox.com/scl/fi/tiy4syytz5d0tvk3fau9g/utf8len.cpp?rlkey=4njtt4m1nvxdyr95pllvka3td&dl=0>

    Result:

    Screenshot 2025-11-23 124007.png --- Size: 58,4 KB <https://www.dropbox.com/scl/fi/4zccn7262qudthb88gnoi/Screenshot-2025-11-23-124007.png?rlkey=3xu71czkiooujj86mfmlnao15&dl=0>

    Unfortunately, the console does not display the last character
    correctly. But you can see it correctly on the left side within the editor.

    Alfred



    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From David Brown@3:633/10 to All on Sun Nov 23 18:18:42 2025
    On 23/11/2025 17:35, Bonita Montero wrote:
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.
    That's your definition.

    No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

    Each UTF-8 character is also a code point and is built with a varying
    number of characters.
    Some code points do not represent characters - they can be
    noncharacters, reserved, surrogates, etc. Some Unicode characters
    require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
    one code point. Some Unicode characters can be represented as either a
    single code point or a combination (again, accented letters or other diacriticals are common cases here).

    Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
    characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

    Counting Unicode characters is a very difficult task. Counting code
    points is much simpler and more clearly specified. Neither task is
    normally particularly useful - they don't tell you much about space
    usage that can be helpful for things like memory allocation, and they
    don't tell you much about the size taken on screens or when printed out.


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Mon Nov 24 13:08:30 2025
    Am 23.11.2025 um 18:18 schrieb David Brown:
    On 23/11/2025 17:35, Bonita Montero wrote:
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.
    That's your definition.

    No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>


    Idiot ...
    A code-point is well defined, but a character can be an ASCII-character
    and a Unicode code-point.

    Some code points do not represent characters - they can be
    noncharacters, reserved, surrogates, etc. Some Unicode characters
    require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
    one code point. Some Unicode characters can be represented as either
    a single code point or a combination (again, accented letters or other diacriticals are common cases here).
    Everyone understood what I meant. And you're pettifogging now.

    Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
    characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

    Counting Unicode characters is a very difficult task. Counting code
    points is much simpler and more clearly specified. Neither task is
    normally particularly useful - they don't tell you much about space
    usage that can be helpful for things like memory allocation, and they
    don't tell you much about the size taken on screens or when printed out.



    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Chris Vine@3:633/10 to All on Mon Nov 24 12:53:00 2025
    On Sun, 23 Nov 2025 18:18:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:
    On 23/11/2025 17:35, Bonita Montero wrote:
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.
    That's your definition.

    No, it is the Unicode definition. See <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

    Each UTF-8 character is also a code point and is built with a varying number of characters.
    Some code points do not represent characters - they can be
    noncharacters, reserved, surrogates, etc. Some Unicode characters
    require more than one code point, such as those that need combining diacritical marks. Some Unicode characters can be found at more than
    one code point. Some Unicode characters can be represented as either a single code point or a combination (again, accented letters or other diacriticals are common cases here).

    Trying to pin down what a Unicode "character" is turns out to be very difficult, and does not match what people usually think of as
    characters. But one thing that we can be sure about, is that Unicode characters and Unicode code points are not synonymous.

    Counting Unicode characters is a very difficult task. Counting code
    points is much simpler and more clearly specified. Neither task is
    normally particularly useful - they don't tell you much about space
    usage that can be helpful for things like memory allocation, and they
    don't tell you much about the size taken on screens or when printed out.

    It looks as if the original poster may be confusing in her mind a
    unicode character with what the unicode standard refers to as a
    "grapheme", and has ended up with a more or less useless function
    counting code points.

    Julia does in fact provide a function which counts graphemes
    ( https://riptutorial.com/julia-lang/example/20449/graphemes , which
    makes a reasonable job of explaining the problem), and python has a
    grapheme package available also, and probably some other languages do
    too. However even counting graphemes and/or grapheme clusters doesn't necessarily help you with calculating sizes for screen display, and the
    reward of doing this, such as it is, seems barely worth the effort (and potential inefficiency).

    Chris

    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From David Brown@3:633/10 to All on Mon Nov 24 14:09:01 2025
    On 24/11/2025 13:08, Bonita Montero wrote:
    Am 23.11.2025 um 18:18 schrieb David Brown:
    On 23/11/2025 17:35, Bonita Montero wrote:
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already mentioned, it
    only counts the number of code points.
    That's your definition.

    No, it is the Unicode definition. See
    <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>

    Idiot ...

    Speak for yourself.

    A code-point is well defined, but a character can be an ASCII-character
    and a Unicode code-point.

    Yes, Unicode code-points are well-defined. The term "character" is not
    nearly as clear. But what /is/ clear is that "code point" and
    "character" do not mean the same thing.


    Some code points do not represent characters - they can be
    noncharacters, reserved, surrogates, etc. Some Unicode characters
    require more than one code point, such as those that need combining
    diacritical marks. Some Unicode characters can be found at more than
    one code point. Some Unicode characters can be represented as either
    a single code point or a combination (again, accented letters or other
    diacriticals are common cases here).
    Everyone understood what I meant. And youre pettifoging now.

    I think it was clear that you meant "code points" when you wrote
    "characters". Fair enough. Counting Unicode characters is a much more complicated matter, and not what you were trying to do.

    But you jumped at Alfred when he correctly pointed out that your code
    counts code points, not characters. I am merely informing you that this
    is not Alfred's definition - it is the Unicode definition. The /real/ definition in this context.

    It's easy enough to make the mistake and write "character" when you mean
    "code point". But when someone points out the mistake, accept that
    correction rather than shooting the messenger.


    Trying to pin down what a Unicode "character" is turns out to be very
    difficult, and does not match what people usually think of as
    characters. But one thing that we can be sure about, is that Unicode
    characters and Unicode code points are not synonymous.

    Counting Unicode characters is a very difficult task. Counting code
    points is much simpler and more clearly specified. Neither task is
    normally particularly useful - they don't tell you much about space
    usage that can be helpful for things like memory allocation, and they
    don't tell you much about the size taken on screens or when printed out.




    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Mon Nov 24 14:31:05 2025
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't really characters in the literal
    sense either, but that's what they're called. You're completely nuts.


    Am 24.11.2025 um 14:09 schrieb David Brown:
    On 24/11/2025 13:08, Bonita Montero wrote:
    Am 23.11.2025 um 18:18 schrieb David Brown:
    On 23/11/2025 17:35, Bonita Montero wrote:
    Am 23.11.2025 um 13:22 schrieb Alfred Peters:
    Es schrieb einmal Bonita Montero:

    This code counts the number of UTF-8 characters.
    Unfortunately, no. As someone in the other group already
    mentioned, it
    only counts the number of code points.
    That's your definition.

    No, it is the Unicode definition. See
    <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G2212>
    Idiot ...

    Speak for yourself.

    A code-point is well defined, but a character can be an
    ASCII-character and a Unicode code-point.

    Yes, Unicode code-points are well-defined. The term "character" is
    not nearly as clear. But what /is/ clear is that "code point" and "character" do not mean the same thing.


    Some code points do not represent characters - they can be
    noncharacters, reserved, surrogates, etc. Some Unicode characters
    require more than one code point, such as those that need combining
    diacritical marks. Some Unicode characters can be found at more
    than one code point. Some Unicode characters can be represented as
    either a single code point or a combination (again, accented letters
    or other diacriticals are common cases here).
    Everyone understood what I meant. And youre pettifoging now.

    I think it was clear that you meant "code points" when you wrote "characters". Fair enough. Counting Unicode characters is a much
    more complicated matter, and not what you were trying to do.

    But you jumped at Alfred when he correctly pointed out that your code
    counts code points, not characters. I am merely informing you that
    this is not Alfred's definition - it is the Unicode definition. The
    /real/ definition in this context.

    It's easy enough to make the mistake and write "character" when you
    mean "code point". But when someone points out the mistake, accept
    that correction rather than shooting the messenger.


    Trying to pin down what a Unicode "character" is turns out to be
    very difficult, and does not match what people usually think of as
    characters. But one thing that we can be sure about, is that
    Unicode characters and Unicode code points are not synonymous.

    Counting Unicode characters is a very difficult task. Counting code
    points is much simpler and more clearly specified. Neither task is
    normally particularly useful - they don't tell you much about space
    usage that can be helpful for things like memory allocation, and
    they don't tell you much about the size taken on screens or when
    printed out.





    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From David Brown@3:633/10 to All on Mon Nov 24 15:45:42 2025
    On 24/11/2025 14:31, Bonita Montero wrote:
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't really characters in the literal
    sense either, but that's what they're called.

    That's /your/ definition.

    Use your own private definition if you want, but don't deride other
    people for using correct definitions appropriate to the context. You
    made a function for counting Unicode code points from a UTF8 string.
    That's great - it's a useful thing to do. Just call it that instead of picking an inaccurate name for it. What is your difficulty in using the simple, clear and correct term "code point" instead of a vague and
    poorly defined alternative?


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Mon Nov 24 16:32:00 2025
    Am 24.11.2025 um 15:45 schrieb David Brown:
    On 24/11/2025 14:31, Bonita Montero wrote:
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't really
    characters in the literal
    sense either, but that's what they're called.

    That's /your/ definition.
    There's no definition at all with that.
    You're utterly compulsive.

    Use your own private definition if you want, but don't deride other
    people for using correct definitions appropriate to the context. You
    made a function for counting Unicode code points from a UTF8 string.
    That's great - it's a useful thing to do. Just call it that instead of picking an inaccurate name for it. What is your difficulty in using
    the simple, clear and correct term "code point" instead of a vague and poorly defined alternative?



    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Michael S@3:633/10 to All on Mon Nov 24 19:16:04 2025
    On Mon, 24 Nov 2025 15:45:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    On 24/11/2025 14:31, Bonita Montero wrote:
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't
    really characters in the literal
    sense either, but that's what they're called.

    That's /your/ definition.

    Use your own private definition if you want, but don't deride other
    people for using correct definitions appropriate to the context. You
    made a function for counting Unicode code points from a UTF8 string.
    That's great - it's a useful thing to do. Just call it that instead
    of picking an inaccurate name for it. What is your difficulty in
    using the simple, clear and correct term "code point" instead of a
    vague and poorly defined alternative?


    Do you happen to know what is represented by values of type 'char' in
    Rust? :-)


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From David Brown@3:633/10 to All on Mon Nov 24 18:32:13 2025
    On 24/11/2025 18:16, Michael S wrote:
    On Mon, 24 Nov 2025 15:45:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    On 24/11/2025 14:31, Bonita Montero wrote:
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't
    really characters in the literal
    sense either, but that's what they're called.

    That's /your/ definition.

    Use your own private definition if you want, but don't deride other
    people for using correct definitions appropriate to the context. You
    made a function for counting Unicode code points from a UTF8 string.
    That's great - it's a useful thing to do. Just call it that instead
    of picking an inaccurate name for it. What is your difficulty in
    using the simple, clear and correct term "code point" instead of a
    vague and poorly defined alternative?


    Do you happen to know what is represented by values of type 'char' in
    Rust? :-)


    No, sorry.


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Michael S@3:633/10 to All on Mon Nov 24 19:51:47 2025
    On Mon, 24 Nov 2025 18:32:13 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    On 24/11/2025 18:16, Michael S wrote:
    On Mon, 24 Nov 2025 15:45:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    On 24/11/2025 14:31, Bonita Montero wrote:
    I don't understand your problem. You seem extremely obsessive. A
    character can be anything,
    a Unicode code point or an ASCII character. The latter aren't
    really characters in the literal
    sense either, but that's what they're called.

    That's /your/ definition.

    Use your own private definition if you want, but don't deride other
    people for using correct definitions appropriate to the context.
    You made a function for counting Unicode code points from a UTF8
    string. That's great - it's a useful thing to do. Just call it
    that instead of picking an inaccurate name for it. What is your
    difficulty in using the simple, clear and correct term "code
    point" instead of a vague and poorly defined alternative?


    Do you happen to know what is represented by values of type 'char'
    in Rust? :-)


    No, sorry.


    I thought that you can guess that it represents Unicode code point.
    Of course, "The Book" pays lip service: "However, a “character” isn’t
    really a concept in Unicode, so your human intuition for what a
    “character” is may not match up with what a char is in Rust."
    But who pays attention to lip service?
    You can be reasonably sure that 99.99% of Rust programmers use the word 'character' in the meaning 'Unicode code point'.






    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Michael S@3:633/10 to All on Mon Nov 24 19:54:55 2025
    On Mon, 24 Nov 2025 15:45:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    it's a useful thing to do.

    Is it?




    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From David Brown@3:633/10 to All on Mon Nov 24 21:14:30 2025
    On 24/11/2025 18:54, Michael S wrote:
    On Mon, 24 Nov 2025 15:45:42 +0100
    David Brown <david.brown@hesbynett.no> wrote:

    it's a useful thing to do.

    Is it?


    I don't think it is a /very/ useful thing, compared to things like
    finding the length of the UTF-8 string in bytes, or the space needed to display the string. But I'm fairly sure people do use such functions.


    --- PyGate Linux v1.5.1
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)