The problem is a combination of your `BUFFER_SIZE`, the size of the file `chinese_test`, and `wchar_t` byte alignment. As evidence, try drastically increasing `BUFFER_SIZE` - you should start getting the answer you want.
What happens is that your program works for the first block of text that it receives. But think about what happens in your code if a character is split between the first and second blocks as follows:
    |         First Block         |   Second Block    |
    | [wchar_t] [wchar_t] ... [wchar_t] [wchar_t] ... |
    | [1,2,3,4] [1,2,3,4] ... [1,2,3,4] [1,2,3,4] ... |
Your code will start the second block at the 3rd byte of a character, and this will not be recognized as a valid character. Since `mbtowc` returns `-1` when it does not find a valid character, your loop will end immediately and will count zero characters for that entire block. The same applies to the following blocks.
EDIT:
Another problem that I noticed is that for `mbtowc` to work you need to set the locale. Given all these issues, I wrote the following, which returns the same character count for me as `wc`:
/*
 * Count the bytes and multibyte characters in a file, decoding the bytes
 * according to the current locale's multibyte encoding.
 */
#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size in bytes of the read buffer used by get_counts(). */
int BUFFER_SIZE = 1024;

/* File that is read when no path is given on the command line. */
const char *DEFAULT_F_IN = "chinese_test";

/* Running totals accumulated while scanning the input. */
struct counts {
    int bytes; /* bytes that belonged to successfully decoded characters */
    int chars; /* successfully decoded characters */
};

/*
 * Decode as many complete multibyte characters as possible from
 * buf[0 .. buf_size) and add them to *c.
 *
 * Returns the number of trailing bytes that were NOT consumed: 0 when the
 * whole block decoded cleanly, otherwise the length of the tail that starts
 * at the first byte mbtowc() rejected (an invalid sequence, or a character
 * cut off by the end of the block).
 */
int count_block(struct counts *c, char *buf, int buf_size) {
    int offset = 0;

    while (offset < buf_size) {
        /* Never let mbtowc() look past the valid data in the block. */
        size_t avail = (size_t)(buf_size - offset);
        size_t limit = avail < (size_t)MB_CUR_MAX ? avail : (size_t)MB_CUR_MAX;

        int n = mbtowc(NULL, buf + offset, limit);
        if (n < 0) {
            /*
             * mbtowc() keeps internal conversion state, and a failed
             * conversion may leave it dirty. Reset it so the caller can
             * retry from the first byte of the rejected character.
             */
            (void)mbtowc(NULL, NULL, 0);
            break;
        }
        if (n == 0) {
            /* A return of 0 means a NUL byte: one character, one byte. */
            n = 1;
        }

        offset += n;
        c->bytes += n;
        c->chars++;
    }

    return buf_size - offset;
}

/*
 * Read fd to the end and store the byte and character totals in *c.
 *
 * A character that straddles two reads is handled by moving its leading
 * bytes to the front of the buffer and appending the next read after them,
 * so the stream does not need to be seekable.
 *
 * Prints a message with perror() and exits with status 1 on a read error,
 * on an invalid multibyte sequence, or on a sequence truncated by EOF.
 */
void get_counts(struct counts *c, FILE *fd) {
    c->bytes = 0;
    c->chars = 0;

    /* A variable-length array must have a positive size. */
    if (BUFFER_SIZE <= 0) {
        return;
    }

    char buf[BUFFER_SIZE];
    size_t carried = 0; /* undecoded bytes kept from the previous block */

    for (;;) {
        size_t got = fread(buf + carried, sizeof(*buf),
                           (size_t)BUFFER_SIZE - carried, fd);
        if (got == 0 && ferror(fd)) {
            perror("Error");
            exit(1);
        }

        size_t total = carried + got;
        if (total == 0) {
            break;
        }

        int remaining = count_block(c, buf, (int)total);
        if (remaining > 0) {
            /*
             * MB_CUR_MAX or more undecodable bytes cannot be a character
             * that was merely cut short. A tail left over when nothing new
             * could be read (EOF, or a buffer too small to hold a single
             * character) can never be completed either.
             */
            if ((size_t)remaining >= (size_t)MB_CUR_MAX || got == 0) {
                errno = EILSEQ;
                perror("Error");
                exit(1);
            }
            memmove(buf, buf + (total - (size_t)remaining), (size_t)remaining);
        }
        carried = (size_t)remaining;

        if (got == 0) {
            break;
        }
    }
}

/*
 * Usage: prog [file]
 *
 * Prints the character and byte counts of the given file, or of
 * DEFAULT_F_IN when no argument is supplied.
 */
int main(int argc, char *argv[]) {
    const char *path = (argc > 1) ? argv[1] : DEFAULT_F_IN;

    /* mbtowc() decodes according to LC_CTYPE; adopt the user's locale. */
    setlocale(LC_ALL, "");

    FILE *fd = fopen(path, "rb");
    if (fd == NULL) {
        perror(path);
        return 1;
    }

    struct counts c;
    get_counts(&c, fd);
    fclose(fd);

    printf("chars: %d\tbytes: %d\n", c.chars, c.bytes);
    return 0;
}