define the probability? Say in 0...1?
On Fri, 12 Dec 2025 15:33:01 -0800, Chris M. Thomasson wrote:
define the probability? Say in 0...1?
Probabilities for Chris... Enjoy.
#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>
/*
 * map_strict[] - strict byte classification table.
 *
 * Indexed by byte value. An entry of 1 marks a byte accepted as text,
 * an entry of 0 marks a byte treated as a binary indicator.
 *
 * Accepted (1):
 *   0x09 TAB, 0x0A LF, 0x0D CR      whitespace
 *   0x20-0x7E                       printable ASCII
 *   0xA0-0xFF                       ISO-8859-1 high printable range
 *
 * Rejected (0):
 *   0x00                            NUL
 *   0x01-0x08, 0x0B-0x0C, 0x0E-0x1F remaining C0 controls
 *   0x7F                            DEL
 *   0x80-0x9F                       C1 controls
 */
static const uint8_t map_strict[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
/*
 * map_loose[] - loose byte classification table.
 *
 * Indexed by byte value. An entry of 1 marks a byte accepted as text,
 * an entry of 0 marks a byte treated as a binary indicator. Unlike
 * map_strict[], every byte with the high bit set is accepted, so
 * multi-byte encodings and 8-bit code pages are not flagged.
 *
 * Valid bytes (1):
 *   - ASCII printable characters: 0x20-0x7E
 *   - Whitespace: TAB (0x09), LF (0x0A), CR (0x0D)
 *   - High bytes: 0x80-0xFF
 *
 * Invalid bytes (0):
 *   - NUL byte: 0x00
 *   - C0 control codes: 0x01-0x08, 0x0B-0x0C, 0x0E-0x1F
 *   - DEL character: 0x7F
 */
static const uint8_t map_loose[256] = {
    0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0, // 00
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 10
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 20
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 30
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 40
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 50
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 60
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, // 70 (0x7F DEL is invalid)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 80
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 90
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // A0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // B0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // C0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // D0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // E0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  // F0
};
/*
 * is_text_file(const char *path, const uint8_t map[256], double probability)
 *
 * Determines whether a file is likely a text file, driven by a byte map
 * and a threshold on the share of invalid bytes.
 *
 * Parameters:
 *
 *   path
 *     Pathname of the file to examine. The function follows symlinks
 *     (it uses stat()) and only operates on regular files.
 *
 *   map[256]
 *     A 256-entry lookup table indexed by byte value (0-255).
 *     Each entry must be either:
 *       1 -> byte is considered valid text
 *       0 -> byte is considered invalid (binary indicator)
 *
 *   probability
 *     Maximum allowed percentage (0.0 to 100.0) of invalid bytes.
 *     If the percentage of bytes for which map[byte] == 0 exceeds
 *     this value, the file is classified as non-text.
 *
 * Behavior:
 *   - Reads the file in binary mode.
 *   - Counts total bytes and 'bad' bytes (map[] == 0).
 *   - Returns 1 if the bad byte percentage over the whole file is
 *     <= probability.
 *   - Returns 0 if that percentage is > probability, or if the file
 *     is empty.
 *   - Returns -1 if the file cannot be accessed or opened, is not a
 *     regular file, or a read error occurs.
 *   - Exits early only once the bad bytes seen so far already exceed
 *     the threshold relative to the file size reported by stat(), i.e.
 *     when no amount of further valid bytes could bring the ratio back
 *     under the limit. The running ratio is deliberately NOT used for
 *     this: a single invalid byte at the start of a file is 100% of the
 *     bytes seen so far but may be a negligible share of the whole file.
 *
 * Returns:
 *    1 - text
 *    0 - binary indicator (or empty file)
 *   -1 - could not access/open/read, or not a regular file
 *
 * Example allowing up to 5.5% invalid bytes:
 *   is_text_file("./example.txt", your_map, 5.5)
 *
 */
int is_text_file(const char *path, const uint8_t map[256], double probability) {
    struct stat st;
    if (stat(path, &st) != 0) {
        return -1; // can not access file
    }
    if (!S_ISREG(st.st_mode)) {
        return -1; // reject: directories/devices/sockets
    }

    FILE *f = fopen(path, "rb");
    if (!f) {
        return -1; // could not open file
    }

    // Size reported by stat(); used only as the denominator of the
    // early-exit test. The final verdict always uses the bytes actually read.
    const double expected_size = (double)st.st_size;

    // 4KB: 4096, 8KB: 8192, 16KB: 16384, 32KB: 32768, 64KB: 65536
    unsigned char buf[16384];
    size_t n;
    size_t total = 0;
    size_t bad = 0;

    while ((n = fread(buf, 1, sizeof buf, f)) > 0) {
        for (size_t i = 0; i < n; i++) {
            if (!map[buf[i]]) {
                bad++;
            }
        }
        total += n;

        // Early exit: only valid while the file has not outgrown the size
        // stat() reported (and that size is known to be non-zero); in that
        // case bad/expected_size is a lower bound on the final ratio.
        if (expected_size > 0.0 && (double)total <= expected_size &&
            ((double)bad / expected_size) * 100.0 > probability) {
            fclose(f);
            return 0; // not text
        }
    }

    // fread() returns 0 for both EOF and failure; tell them apart so a
    // truncated read is not classified as if it were the whole file.
    const int read_failed = ferror(f);
    fclose(f);
    if (read_failed) {
        return -1;
    }

    if (total == 0) {
        return 0; // empty file
    }

    // final check over the bytes actually read
    return (((double)bad / (double)total) * 100.0 <= probability) ? 1 : 0;
}
How long have you been dealing with that ? 10 days ?
I finished the AVX-512 version in C++ in two hours.
On 2025-12-06, Michael Sanders <porkchop@invalid.foo> wrote:
Am I close? Missing anything you'd consider to be (or not) needed?
Hi Michael,
I contract for the defense industry and badly need this function!
I am working with proposed code like:
if (is_binary_file(arg))
launch_nuclear_strike();
So I'm really sweating over the implementation, as you can imagine.
This thread has been very helpful.
I'm still leaning toward my paranoid function which just checks that
every bit of every byte is either 0 or 1 to confirm that the binary
system is used.
In the I/O error case, I will cautiously return a true value; we would
not want our side to lose due to a storage hardware issue.
On Tue, 16 Dec 2025 17:24:49 +0100, Bonita Montero wrote:
How long have you been dealing with that ? 10 days ?
As long as I want to.
I finished the AVX-512 version in C++ in two hours.
[...]
Sigh, here we go...
That's not so important because it's about the general principle.
Everything in every thread I've read from you is faster, better,
& you finished in two hours & yet here you are.
*Very rich* Bonita. Thank you, but as others have pointed out:
This is a C newsgroup, not a C++ newsgroup.
static const char phrase[] = "I dont want or care about C++..."; printf("%s\n", phrase);
Am 17.12.2025 um 20:35 schrieb Michael Sanders:
Sigh, here we go...
That's not so important because it's about the general principle.
Everything in every thread I've read from you is faster, better,
& you finished in two hours & yet here you are.
*Very rich* Bonita. Thank you, but as others have pointed out:
This is a C newsgroup, not a C++ newsgroup.
Writing it in C would only be slightly different.
static const char phrase[] = "I dont want or care about C++...";
printf("%s\n", phrase);
So, why not post C versions? Or, I guess it would be extended C.
Because it's easier to write safe code in C++.
Am 18.12.2025 um 13:49 schrieb bart:
So, why not post C versions? Or, I guess it would be extended C.
Because it's easier to write safe code in C++.
Some people here thought they could develop efficient code for that.
Because it's easier to write safe code in C++.
Wouldn't it be easier still to just not post at all?
A text file is supposed to end with a '\n'
This brings back memories, most of them fond.
From the glibc Reference Manual:
"The distinction between text and binary streams is only meaningful
on systems where text files have a different internal
representation. On Unix systems, there is no difference between the
two; the 'b' is accepted but ignored."
On Sun, 7 Dec 2025 19:01:02 +0000, Richard Harnden wrote:
A text file is supposed to end with a '\n'
PDF files end with that. The object index comes at the end, and each
index entry is fixed in length and ends with \015\012.
But the spec makes it very clear that PDF files are not supposed to be treated as text files.
The best you can do, is for the PDF to be entirely text except for
some bytes near the top (second line). It's not exactly clear what
they do ...
.. with CRLF, NEL line terminators
Also, the only difference between XML 1.0 and XML 1.1 is that the
latter adds NEL as a permitted line terminator.
I tested the "find.exe" in Cygwin64 and it did not finish. I used
Process Monitor to see what it was doing, and there was a lot of
registry activity. (There should not be registry activity by
find.exe or file.exe )
I tried the file.exe command and it didn't provide output and the
machine hung. My machine never hangs. It's a model citizen. Windows
Defender did not trip. An offline scan with Windows Defender did not
find anything.
On Mon, 8 Dec 2025 13:51:49 +0100, Bonita Montero wrote:
No, MacOS, not macOS; the latter is "MacOS" since macOS X.
From the glibc Reference Manual:
However, you need to distinguish the two if you want, like Python
"The distinction between text and binary streams is only meaningful
on systems where text files have a different internal
representation. On Unix systems, there is no difference between the
two; the 'b' is accepted but ignored."
does, to be able to have a "universal newline" mode, where you can
correctly handle line breaks in files written on any of the three main platform families: *nix/Unix, Windows, and macOS.
This is such a useful idea I'm surprised no one has suggested that C
should offer the option.
However, you need to distinguish the two if you want, like Python
No, MacOS, not macOS; the latter is "MacOS" since macOS X.
does, to be able to have a "universal newline" mode, where you can
correctly handle line breaks in files written on any of the three main
platform families: *nix/Unix, Windows, and macOS.
Your assertion is contrary to that operating system vendor's own
stance and branding.
https://www.apple.com/os/macos
Am 27.12.2025 um 06:51 schrieb Lawrence D'Oliveiro:
On Mon, 8 Dec 2025 13:51:49 +0100, Bonita Montero wrote:
From the glibc Reference Manual:
"The distinction between text and binary streams is only
meaningful on systems where text files have a different internal
representation. On Unix systems, there is no difference between
the two; the 'b' is accepted but ignored."
However, you need to distinguish the two if you want, like Python
does, to be able to have a "universal newline" mode, where you can
correctly handle line breaks in files written on any of the three main
platform families: *nix/Unix, Windows, and macOS.
This is such a useful idea I'm surprised no one has suggested that C
should offer the option.
| Sysop: | Tetrazocine |
|---|---|
| Location: | Melbourne, VIC, Australia |
| Users: | 15 |
| Nodes: | 8 (0 / 8) |
| Uptime: | 108:57:06 |
| Calls: | 188 |
| Files: | 21,502 |
| Messages: | 82,320 |