• Re: is_binary_file()

    From Michael Sanders@3:633/10 to All on Tue Dec 16 00:26:29 2025
    On Fri, 12 Dec 2025 15:33:01 -0800, Chris M. Thomasson wrote:

    define the probability? Say in 0...1?

    Probabilities for Chris... Enjoy.

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/stat.h>

    /*
     * map_strict[] - byte classification table, strict variant.
     *
     * Entries set to 1 (accepted as text):
     *   - TAB (0x09), LF (0x0A), CR (0x0D)
     *   - printable ASCII, 0x20-0x7E
     *   - ISO-8859-1 high printables, 0xA0-0xFF
     *
     * Entries set to 0 (binary indicators):
     *   - NUL (0x00)
     *   - remaining C0 controls (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F)
     *   - DEL (0x7F)
     *   - C1 controls (0x80-0x9F)
     */

    /* Sixteen copies of v: one complete table row. */
    #define MAP_STRICT_ROW(v) v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v

    static const uint8_t map_strict[256] = {
        0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,   /* 0x00-0x0F: TAB, LF, CR only */
        MAP_STRICT_ROW(0),                 /* 0x10-0x1F: C0 controls      */
        MAP_STRICT_ROW(1),                 /* 0x20-0x2F                   */
        MAP_STRICT_ROW(1),                 /* 0x30-0x3F                   */
        MAP_STRICT_ROW(1),                 /* 0x40-0x4F                   */
        MAP_STRICT_ROW(1),                 /* 0x50-0x5F                   */
        MAP_STRICT_ROW(1),                 /* 0x60-0x6F                   */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,   /* 0x70-0x7F: DEL rejected     */
        MAP_STRICT_ROW(0),                 /* 0x80-0x8F: C1 controls      */
        MAP_STRICT_ROW(0),                 /* 0x90-0x9F: C1 controls      */
        MAP_STRICT_ROW(1),                 /* 0xA0-0xAF                   */
        MAP_STRICT_ROW(1),                 /* 0xB0-0xBF                   */
        MAP_STRICT_ROW(1),                 /* 0xC0-0xCF                   */
        MAP_STRICT_ROW(1),                 /* 0xD0-0xDF                   */
        MAP_STRICT_ROW(1),                 /* 0xE0-0xEF                   */
        MAP_STRICT_ROW(1)                  /* 0xF0-0xFF                   */
    };

    #undef MAP_STRICT_ROW

    /*
     * map_loose[]
     *
     * Byte classification table, loose variant: every high byte is accepted.
     *
     * Valid bytes (entry == 1):
     * - ASCII printable characters: 0x20-0x7E
     * - Whitespace: TAB (0x09), LF (0x0A), CR (0x0D)
     * - High bytes: 0x80-0xFF
     *
     * Invalid bytes (entry == 0, binary indicators):
     * - NULL byte: 0x00
     * - C0 control codes: 0x01-0x08, 0x0B-0x0C, 0x0E-0x1F
     * - DEL character: 0x7F
     */

    static const uint8_t map_loose[256] = {
    0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0, // 00 (TAB, LF, CR valid)
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 10
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 20
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 30
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 40
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 50
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 60
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, // 70 (0x7F DEL is invalid, as documented above)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 80
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 90
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // A0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // B0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // C0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // D0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // E0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // F0
    };

    /*
     * is_text_file(const char *path, const uint8_t map[256], double probability)
     *
     * Determines whether a file is likely a text file, driven by a byte map
     * & a tolerance for invalid bytes.
     *
     * Signature:
     *
     * path
     *   Pathname of the file to examine. stat() follows symlinks; anything
     *   that is not a regular file is rejected.
     *
     * map[256]
     *   A 256-entry lookup table indexed by byte value (0-255).
     *   Each entry must be either:
     *     1 -> byte is considered valid text
     *     0 -> byte is considered invalid (binary indicator)
     *
     * probability
     *   Maximum allowed percentage (0.0 to 100.0) of invalid bytes, measured
     *   over the WHOLE file. If the percentage of bytes for which
     *   map[byte] == 0 exceeds this value, the file is classified as non-text.
     *
     * Behavior:
     * - Reads the file in binary mode, one buffer at a time.
     * - Counts total bytes & 'bad' bytes (map[] == 0).
     * - Exits early only when the bad bytes seen so far already exceed the
     *   threshold relative to the size reported by stat(), i.e. when no
     *   amount of remaining good bytes could bring the percentage back down.
     *   (Comparing the running ratio instead would reject any file that
     *   merely starts with a bad byte.)
     *
     * Returns:
     *  1 - text: bad byte percentage <= probability
     *  0 - binary indicator: bad byte percentage > probability, or empty file
     * -1 - stat()/fopen() failed, the path is not a regular file, or a read
     *      error occurred
     *
     * Example tolerating up to 5% invalid bytes:
     *   is_text_file("./example.txt", map_loose, 5.0)
     *
     */

    int is_text_file(const char *path, const uint8_t map[256], double probability) {

        struct stat st;
        if (stat(path, &st) != 0) return -1; // can not access file
        if (!S_ISREG(st.st_mode)) return -1; // reject: directories/devices/sockets

        FILE *f = fopen(path, "rb");
        if (!f) return -1; // could not open file

        // Size as reported by stat(); only used to bound the early exit.
        const double expected = (double)st.st_size;

        // 4KB: 4096, 8KB: 8192, 16KB: 16384, 32KB: 32768, 64KB: 65536
        unsigned char buf[16384];
        size_t n;
        uint64_t total = 0; // 64-bit so large files cannot wrap a 32-bit size_t
        uint64_t bad = 0;

        while ((n = fread(buf, 1, sizeof buf, f)) > 0) {
            for (size_t i = 0; i < n; i++) {
                if (!map[buf[i]]) bad++;
            }
            total += n;

            // Early exit, checked once per buffer. It is trusted only while
            // the reported size is still plausible (total <= expected); files
            // that report size 0 or have grown fall through to the final check.
            // Same expression shape as the final check, so both agree exactly
            // when total == expected.
            if ((double)total <= expected &&
                ((double)bad / expected) * 100.0 > probability) {
                fclose(f);
                return 0; // not text
            }
        }

        // fread() returns 0 for both EOF and error; tell them apart.
        if (ferror(f)) {
            fclose(f);
            return -1;
        }

        fclose(f);

        if (total == 0) return 0; // empty file: nothing to classify as text

        // final probability check over everything that was read
        return (((double)bad / (double)total) * 100.0 <= probability) ? 1 : 0;
    }

    --
    :wq
    Mike Sanders

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Tue Dec 16 17:24:49 2025
    How long have you been dealing with that ? 10 days ?
    I finished the AVX-512 version in C++ in two hours.

    Am 16.12.2025 um 01:26 schrieb Michael Sanders:
    On Fri, 12 Dec 2025 15:33:01 -0800, Chris M. Thomasson wrote:

    define the probability? Say in 0...1?
    Probabilities for Chris... Enjoy.

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/stat.h>

    /*
     * map_strict[] - byte classification table, strict variant.
     *
     * Entries set to 1 (accepted as text):
     *   - TAB (0x09), LF (0x0A), CR (0x0D)
     *   - printable ASCII, 0x20-0x7E
     *   - ISO-8859-1 high printables, 0xA0-0xFF
     *
     * Entries set to 0 (binary indicators):
     *   - NUL (0x00)
     *   - remaining C0 controls (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F)
     *   - DEL (0x7F)
     *   - C1 controls (0x80-0x9F)
     */

    /* Sixteen copies of v: one complete table row. */
    #define MAP_STRICT_ROW(v) v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v

    static const uint8_t map_strict[256] = {
        0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,   /* 0x00-0x0F: TAB, LF, CR only */
        MAP_STRICT_ROW(0),                 /* 0x10-0x1F: C0 controls      */
        MAP_STRICT_ROW(1),                 /* 0x20-0x2F                   */
        MAP_STRICT_ROW(1),                 /* 0x30-0x3F                   */
        MAP_STRICT_ROW(1),                 /* 0x40-0x4F                   */
        MAP_STRICT_ROW(1),                 /* 0x50-0x5F                   */
        MAP_STRICT_ROW(1),                 /* 0x60-0x6F                   */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,   /* 0x70-0x7F: DEL rejected     */
        MAP_STRICT_ROW(0),                 /* 0x80-0x8F: C1 controls      */
        MAP_STRICT_ROW(0),                 /* 0x90-0x9F: C1 controls      */
        MAP_STRICT_ROW(1),                 /* 0xA0-0xAF                   */
        MAP_STRICT_ROW(1),                 /* 0xB0-0xBF                   */
        MAP_STRICT_ROW(1),                 /* 0xC0-0xCF                   */
        MAP_STRICT_ROW(1),                 /* 0xD0-0xDF                   */
        MAP_STRICT_ROW(1),                 /* 0xE0-0xEF                   */
        MAP_STRICT_ROW(1)                  /* 0xF0-0xFF                   */
    };

    #undef MAP_STRICT_ROW

    /*
     * map_loose[]
     *
     * Byte classification table, loose variant: every high byte is accepted.
     *
     * Valid bytes (entry == 1):
     * - ASCII printable characters: 0x20-0x7E
     * - Whitespace: TAB (0x09), LF (0x0A), CR (0x0D)
     * - High bytes: 0x80-0xFF
     *
     * Invalid bytes (entry == 0, binary indicators):
     * - NULL byte: 0x00
     * - C0 control codes: 0x01-0x08, 0x0B-0x0C, 0x0E-0x1F
     * - DEL character: 0x7F
     */

    static const uint8_t map_loose[256] = {
    0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0, // 00 (TAB, LF, CR valid)
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 10
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 20
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 30
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 40
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 50
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 60
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, // 70 (0x7F DEL is invalid, as documented above)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 80
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 90
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // A0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // B0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // C0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // D0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // E0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // F0
    };

    /*
     * is_text_file(const char *path, const uint8_t map[256], double probability)
     *
     * Determines whether a file is likely a text file, driven by a byte map
     * & a tolerance for invalid bytes.
     *
     * Signature:
     *
     * path
     *   Pathname of the file to examine. stat() follows symlinks; anything
     *   that is not a regular file is rejected.
     *
     * map[256]
     *   A 256-entry lookup table indexed by byte value (0-255).
     *   Each entry must be either:
     *     1 -> byte is considered valid text
     *     0 -> byte is considered invalid (binary indicator)
     *
     * probability
     *   Maximum allowed percentage (0.0 to 100.0) of invalid bytes, measured
     *   over the WHOLE file. If the percentage of bytes for which
     *   map[byte] == 0 exceeds this value, the file is classified as non-text.
     *
     * Behavior:
     * - Reads the file in binary mode, one buffer at a time.
     * - Counts total bytes & 'bad' bytes (map[] == 0).
     * - Exits early only when the bad bytes seen so far already exceed the
     *   threshold relative to the size reported by stat(), i.e. when no
     *   amount of remaining good bytes could bring the percentage back down.
     *   (Comparing the running ratio instead would reject any file that
     *   merely starts with a bad byte.)
     *
     * Returns:
     *  1 - text: bad byte percentage <= probability
     *  0 - binary indicator: bad byte percentage > probability, or empty file
     * -1 - stat()/fopen() failed, the path is not a regular file, or a read
     *      error occurred
     *
     * Example tolerating up to 5% invalid bytes:
     *   is_text_file("./example.txt", map_loose, 5.0)
     *
     */

    int is_text_file(const char *path, const uint8_t map[256], double probability) {

        struct stat st;
        if (stat(path, &st) != 0) return -1; // can not access file
        if (!S_ISREG(st.st_mode)) return -1; // reject: directories/devices/sockets

        FILE *f = fopen(path, "rb");
        if (!f) return -1; // could not open file

        // Size as reported by stat(); only used to bound the early exit.
        const double expected = (double)st.st_size;

        // 4KB: 4096, 8KB: 8192, 16KB: 16384, 32KB: 32768, 64KB: 65536
        unsigned char buf[16384];
        size_t n;
        uint64_t total = 0; // 64-bit so large files cannot wrap a 32-bit size_t
        uint64_t bad = 0;

        while ((n = fread(buf, 1, sizeof buf, f)) > 0) {
            for (size_t i = 0; i < n; i++) {
                if (!map[buf[i]]) bad++;
            }
            total += n;

            // Early exit, checked once per buffer. It is trusted only while
            // the reported size is still plausible (total <= expected); files
            // that report size 0 or have grown fall through to the final check.
            // Same expression shape as the final check, so both agree exactly
            // when total == expected.
            if ((double)total <= expected &&
                ((double)bad / expected) * 100.0 > probability) {
                fclose(f);
                return 0; // not text
            }
        }

        // fread() returns 0 for both EOF and error; tell them apart.
        if (ferror(f)) {
            fclose(f);
            return -1;
        }

        fclose(f);

        if (total == 0) return 0; // empty file: nothing to classify as text

        // final probability check over everything that was read
        return (((double)bad / (double)total) * 100.0 <= probability) ? 1 : 0;
    }



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Michael Sanders@3:633/10 to All on Wed Dec 17 03:19:56 2025
    On Tue, 16 Dec 2025 17:24:49 +0100, Bonita Montero wrote:

    How long have you been dealing with that ? 10 days ?
    I finished the AVX-512 version in C++ in two hours.

    As long as I want to.

    --
    :wq
    Mike Sanders

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lynn McGuire@3:633/10 to All on Wed Dec 17 00:52:51 2025
    On 12/11/2025 11:33 AM, Kaz Kylheku wrote:
    On 2025-12-06, Michael Sanders <porkchop@invalid.foo> wrote:
    Am I close? Missing anything you'd consider to be (or not) needed?

    Hi Michael,

    I contract for the defense industry and badly need this function!

    I am working with proposed code like:

    if (is_binary_file(arg))
    launch_nuclear_strike();

    So I'm really sweating over the implementation, as you can imagine.

    This thread has been very helpful.

    I'm still leaning toward my paranoid function which just checks that
    every bit of every byte is either 0 or 1 to confirm that the binary
    system is used.

    In the I/O error case, I will cautiously return a true value; we would
    not want our side to lose due to a storage hardware issue.

    The probability of this function being correct may be less than 50%. I
    would hope that any military function would be much higher reliability
    than this.

    Lynn



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Wed Dec 17 07:57:42 2025
    Am 17.12.2025 um 04:19 schrieb Michael Sanders:
    On Tue, 16 Dec 2025 17:24:49 +0100, Bonita Montero wrote:

    How long have you been dealing with that ? 10 days ?
    I finished the AVX-512 version in C++ in two hours.
    As long as I want to.

    Here, about 11 times faster than your solution:

    bool asciiAllowedAvx( string_view buf, double prop )
    {
    ÿ ÿ char const
    ÿ ÿ ÿ ÿ *pBegin = buf.data(),
    ÿ ÿ ÿ ÿ *pEnd = pBegin + buf.size();
    ÿ ÿ size_t
    ÿ ÿ ÿ ÿ uBegin = (size_t)pBegin,
    ÿ ÿ ÿ ÿ uEnd = (size_t)pEnd,
    ÿ ÿ ÿ ÿ head = uBegin & 63,
    ÿ ÿ ÿ ÿ tail = uEnd & 63;
    ÿ ÿ __m512i
    ÿ ÿ ÿ ÿ *p64Begin = (__m512i *)(uBegin - head),
    ÿ ÿ ÿ ÿ *p64End = (__m512i *)((uEnd + 63) & -64);
    ÿ ÿ span<__m512i const> range( p64Begin, p64End );
    ÿ ÿ __m512i const
    ÿ ÿ ÿ ÿ prnt = _mm512_set1_epi8( (char)0x20 ),
    ÿ ÿ ÿ ÿ cr = _mm512_set1_epi8( (char)'\r' ),
    ÿ ÿ ÿ ÿ lf = _mm512_set1_epi8( (char)'\n' ),
    ÿ ÿ ÿ ÿ tab = _mm512_set1_epi8( (char)'\t' );
    ÿ ÿ uint64_t mask = (uint64_t)-1ll << head;
    ÿ ÿ ptrdiff_t nFits = 0;
    ÿ ÿ auto cur = range.begin(), end = range.end();
    ÿ ÿ auto doChunk = [&]()
    ÿ ÿ {
    ÿ ÿ ÿ ÿ __m512i chunk = _mm512_loadu_epi8( (void *)to_address( cur ) );
    ÿ ÿ ÿ ÿ uint64_t
    ÿ ÿ ÿ ÿ ÿ ÿ prntMask = _mm512_cmpge_epu8_mask( chunk, prnt ),
    ÿ ÿ ÿ ÿ ÿ ÿ crMask = _mm512_cmpeq_epi8_mask( chunk, cr ),
    ÿ ÿ ÿ ÿ ÿ ÿ lfMask = _mm512_cmpeq_epi8_mask( chunk, lf ),
    ÿ ÿ ÿ ÿ ÿ ÿ tabMask = _mm512_cmpeq_epi8_mask( chunk, tab );
    ÿ ÿ ÿ ÿ nFits += popcount( (prntMask | crMask | lfMask | tabMask) & mask );
    ÿ ÿ };
    ÿ ÿ for( ; cur != end - (bool)tail; ++cur, mask = -1ll )
    ÿ ÿ ÿ ÿ doChunk();
    ÿ ÿ if( tail )
    ÿ ÿ {
    ÿ ÿ ÿ ÿ mask &= ~((uint64_t)-1ll << tail);
    ÿ ÿ ÿ ÿ doChunk();
    ÿ ÿ }
    ÿ ÿ return 1.0 - (double)nFits / (double)(ptrdiff_t)buf.size() > prop;
    }


    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Michael Sanders@3:633/10 to All on Wed Dec 17 19:35:57 2025
    On Wed, 17 Dec 2025 07:57:42 +0100, Bonita Montero wrote:

    [...]

    Sigh, here we go...

    Everything in every thread I've read from you is faster, better,
    & you finished in two hours & yet here you are.

    *Very rich* Bonita. Thank you, but as others have pointed out:

    This is a C newsgroup, not a C++ newsgroup.

    static const char phrase[] = "I dont want or care about C++...";

    printf("%s\n", phrase);

    --
    :wq
    Mike Sanders

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Thu Dec 18 08:44:17 2025
    Am 17.12.2025 um 20:35 schrieb Michael Sanders:
    Sigh, here we go...
    Everything in every thread I've read from you is faster, better,
    & you finished in two hours & yet here you are.
    *Very rich* Bonita. Thank you, but as others have pointed out:
    This is a C newsgroup, not a C++ newsgroup.
    That's not so important because it's about the general principle.
    Writing it in C would only be slightly different.
    static const char phrase[] = "I dont want or care about C++..."; printf("%s\n", phrase);



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From bart@3:633/10 to All on Thu Dec 18 12:49:30 2025
    On 18/12/2025 07:44, Bonita Montero wrote:
    Am 17.12.2025 um 20:35 schrieb Michael Sanders:
    Sigh, here we go...
    Everything in every thread I've read from you is faster, better,
    & you finished in two hours & yet here you are.
    *Very rich* Bonita. Thank you, but as others have pointed out:
    This is a C newsgroup, not a C++ newsgroup.
    That's not so important because it's about the general principle.
    Writing it in C would only be slightly different.

    So, why not post C versions? Or, I guess it would be extended C.

    static const char phrase[] = "I dont want or care about C++...";
    printf("%s\n", phrase);




    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Thu Dec 18 14:06:40 2025
    Am 18.12.2025 um 13:49 schrieb bart:
    So, why not post C versions? Or, I guess it would be extended C.
    Because it's easier to write safe code in C++.
    F.e. I'm using a span of AVX2/AVX-512 words.
    While debugging I have bounds checking with that.

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Kenny McCormack@3:633/10 to All on Thu Dec 18 13:17:39 2025
    In article <10i0u79$aa6d$1@raubtier-asyl.eternal-september.org>,
    Bonita Montero <Bonita.Montero@gmail.com> wrote:
    Am 18.12.2025 um 13:49 schrieb bart:
    So, why not post C versions? Or, I guess it would be extended C.

    Because it's easier to write safe code in C++.

    Wouldn't it be easier still to just not post at all?

    --

    "If God wanted us to believe in him, he'd exist."

    (Linda Smith on "10 Funniest Londoners", TimeOut, 23rd June, 2005.)

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Thu Dec 18 16:03:24 2025
    Am 18.12.2025 um 14:17 schrieb Kenny McCormack:
    Because it's easier to write safe code in C++.
    Wouldn't it be easier still to just not post at all?
    Some people here thought they could develop efficient code for that.
    I just wanted to show that this is possible a lot faster.



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sat Dec 27 03:13:50 2025
    On Sun, 7 Dec 2025 19:01:02 +0000, Richard Harnden wrote:

    A text file is supposed to end with a '\n'

    PDF files end with that. The object index comes at the end, and each
    index entry is fixed in length and ends with \015\012.

    But the spec makes it very clear that PDF files are not supposed to be
    treated as text files.

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sat Dec 27 03:18:07 2025
    On Sun, 7 Dec 2025 03:43:40 -0700, Louis Krupp wrote:

    This brings back memories, most of them fond.

    Many former users of Burroughs systems seem to feel the same. ;)

    I have an unflattering story about John McCarthy, the father of Lisp,
    who was an IBM man who took over a computing centre where there was
    a Burroughs machine ...

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sat Dec 27 05:51:13 2025
    On Mon, 8 Dec 2025 13:51:49 +0100, Bonita Montero wrote:

    From the glibc Reference Manual:

    "The distinction between text and binary streams is only meaningful
    on systems where text files have a different internal
    representation. On Unix systems, there is no difference between the
    two; the 'b' is accepted but ignored."

    However, you need to distinguish the two if you want, like Python
    does, to be able to have a "universal newline" mode, where you can
    correctly handle line breaks in files written on any of the three main
    platform families: *nix/Unix, Windows, and macOS.

    This is such a useful idea I'm surprised no one has suggested that C
    should offer the option.

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Paul@3:633/10 to All on Sat Dec 27 01:28:18 2025
    On Fri, 12/26/2025 10:13 PM, Lawrence D'Oliveiro wrote:
    On Sun, 7 Dec 2025 19:01:02 +0000, Richard Harnden wrote:

    A text file is supposed to end with a '\n'

    PDF files end with that. The object index comes at the end, and each
    index entry is fixed in length and ends with \015\012.

    But the spec makes it very clear that PDF files are not supposed to be treated as text files.


    The best you can do, is for the PDF to be entirely text except for
    some bytes near the top (second line). It's not exactly clear what they do,
    but I've seen at least one document that misses the binary line. That binary-thing could be a hash over the document.

    At least in this PDF, the document is 99% text. And Mutool can be
    used to convert a "mostly binary" PDF, into a "mostly text" PDF.

    If a PDF is encrypted, it is unlikely to have a textual representation
    when naively opening it.

    PDFs can be "anywhere from 99% binary to 99% text". It all depends.
    Generally, the ones that are mostly text are the simplest of documents.
    Rich media documents will have a lot more binary that cannot be
    simplified by simple transformations. You could start in the first place,
    by using different source materials that had closer-to-textual representation to fix that.

    ***********************************************************************************************************
    %PDF-1.4
    <=== these can "look like binary" "25 B8 9A 92 9D 0A"
    1 0 obj<</Type/Catalog/Pages 3 0 R>>
    endobj
    2 0 obj<</Producer(GemBox GemBox.Pdf 1.7 (17.0.35.1042; .NET Framework))/CreationDate(D:20211028151721+02'00')>>
    endobj
    3 0 obj<</Type/Pages/Kids[4 0 R]/Count 1/MediaBox[0 0 595.32 841.92]>>
    endobj
    4 0 obj<</Type/Page/Parent 3 0 R/Resources<</Font<</F0 6 0 R>>>>/Contents 5 0 R>>
    endobj
    5 0 obj<</Length 59>>stream
    BT
    /F0 12 Tf
    1 0 0 1 100 702.7366667 Tm
    (Hello World!)Tj
    ET
    endstream
    endobj
    6 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica/FirstChar 32/LastChar 114/Widths 7 0 R/FontDescriptor 8 0 R>>
    endobj
    7 0 obj[278 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 722 0 0 0 0 0 0 0 0 0 0 0 0 0 0 944 0 0 0 0 0 0 0 0 0 0 0 0 556 556 0 0 0 0 0 0 222 0 0 556 0 0 333]
    endobj
    8 0 obj<</Type/FontDescriptor/Flags 32/FontName/Helvetica/FontFamily(Helvetica)/FontWeight 500/ItalicAngle 0/FontBBox[-166 -225 1000 931]/CapHeight 718/XHeight 523/Ascent 718/Descent -207/StemH 76/StemV 88>>
    endobj
    xref
    0 9
    0000000000 65535 f
    0000000015 00000 n
    0000000059 00000 n
    0000000179 00000 n
    0000000257 00000 n
    0000000346 00000 n
    0000000451 00000 n
    0000000573 00000 n
    0000000773 00000 n
    trailer
    <</Root 1 0 R/ID[<9392A59F3BE7B840805D62746E8A4F29><9392A59F3BE7B840805D62746E8A4F29>]/Info 2 0 R/Size 9>>
    startxref
    988
    %%EOF
    ***********************************************************************************************************

    If "there has to be binary in it", it's on the second line.
    The other lines can be text... if the tools and print drivers
    wanted to do it that way.

    Paul

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sat Dec 27 21:27:21 2025
    On Sat, 27 Dec 2025 01:28:18 -0500, Paul wrote:

    The best you can do, is for the PDF to be entirely text except for
    some bytes near the top (second line). It's not exactly clear what
    they do ...

    The spec recommended the insertion of junk like that simply to
    dissuade file sniffers from concluding that the file is a text
    document.

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sun Dec 28 00:12:53 2025
    On Sat, 6 Dec 2025 03:14:55 -0500, Paul wrote:

    .. with CRLF, NEL line terminators

    Who uses NEL? Only IBM, as far as I know.

    Also, the only difference between XML 1.0 and XML 1.1 is that the
    latter adds NEL as a permitted line terminator.

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Richard Tobin@3:633/10 to All on Sun Dec 28 00:43:12 2025
    In article <10ipsm4$3ssi3$5@dont-email.me>,
    Lawrence D'Oliveiro <ldo@nz.invalid> wrote:

    Also, the only difference between XML 1.0 and XML 1.1 is that the
    latter adds NEL as a permitted line terminator.

    There are some differences concerning control characters too.

    -- Richard

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Sun Dec 28 02:49:19 2025
    On Tue, 9 Dec 2025 06:38:47 -0500, Paul wrote:

    I tested the "find.exe" in Cygwin64 and it did not finish. I used
    Process Monitor to see what it was doing, and there was a lot of
    registry activity. (There should not be registry activity by
    find.exe or file.exe )

    You've got the source code, you can see where that's coming from.

    If it's not coming from the Cygwin code, it's something in Windows
    itself.

    I tried the file.exe command and it didn't provide output and the
    machine hung. My machine never hangs. It's a model citizen. Windows
    Defender did not trip. An offline scan with Windows Defender did not
    find anything.

    That sort of thing seems par for the course with Windows ...

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Mon Dec 29 16:06:50 2025
    Am 27.12.2025 um 06:51 schrieb Lawrence D'Oliveiro:
    On Mon, 8 Dec 2025 13:51:49 +0100, Bonita Montero wrote:

    From the glibc Reference Manual:

    "The distinction between text and binary streams is only meaningful
    on systems where text files have a different internal
    representation. On Unix systems, there is no difference between the
    two; the 'b' is accepted but ignored."
    However, you need to distinguish the two if you want, like Python
    does, to be able to have a "universal newline" mode, where you can
    correctly handle line breaks in files written on any of the three main platform families: *nix/Unix, Windows, and macOS.
    No, MacOS, not macOS; the latter is "MacOS" since macOS X.
    This is such a useful idea I'm surprised no one has suggested that C
    should offer the option.



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From mjos_examine@3:633/10 to All on Mon Dec 29 11:49:01 2025
    On 2025-12-29 10:06 a.m., Bonita Montero wrote:
    However, you need to distinguish the two if you want, like Python
    does, to be able to have a ?universal newline? mode, where you can
    correctly handle line breaks in files written on any of the three main
    platform families: *nix/Unix, Windows, and macOS.
    No, MacOS, not macOS; the latter is "MacOS" since macOS X.

    Your assertion is contrary to that operating system vendor's own stance
    and branding.
    https://www.apple.com/os/macos/


    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Mon Dec 29 20:49:12 2025
    Am 29.12.2025 um 17:49 schrieb mjos_examine:
    Your assertion is contrary to that operating system vendor's own
    stance and branding.
    https://www.apple.com/os/macos

    There's nothing about the distinction between MacOS and macOS on this page.



    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Lawrence D'Oliveiro@3:633/10 to All on Tue Dec 30 01:52:07 2025
    On Mon, 29 Dec 2025 16:06:50 +0100, Bonita Montero wrote:

    Am 27.12.2025 um 06:51 schrieb Lawrence D'Oliveiro:

    On Mon, 8 Dec 2025 13:51:49 +0100, Bonita Montero wrote:

    From the glibc Reference Manual:

    "The distinction between text and binary streams is only
    meaningful on systems where text files have a different internal
    representation. On Unix systems, there is no difference between
    the two; the 'b' is accepted but ignored."

    However, you need to distinguish the two if you want, like Python
    does, to be able to have a "universal newline" mode, where you can
    correctly handle line breaks in files written on any of the three main
    platform families: *nix/Unix, Windows, and macOS.

    This is such a useful idea I'm surprised no one has suggested that C
    should offer the option.

    Way to distract from my point!

    --- PyGate Linux v1.5.2
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)