C++ - UTF-8 character counting not working
Can someone please explain why the 1st function works but the 2nd doesn't?
/// Counts the UTF-8 code points in a NUL-terminated byte string.
///
/// A byte of the form 10xxxxxx is a continuation byte; every other byte
/// starts a new code point, so counting the non-continuation bytes yields
/// the number of code points.
///
/// @param in  NUL-terminated string; must not be null.
/// @return    Number of bytes in `in` that are not continuation bytes.
unsigned int utf8_count(const char* in) {
    unsigned int i = 0;  // byte index
    unsigned int c = 0;  // code points seen so far
    while (in[i]) {
        // (byte & 0xc0) == 0x80 identifies a 10xxxxxx continuation byte.
        if ((in[i] & 0xc0) != 0x80) c++;
        i++;
    }
    return c;
}

/// Counts the UTF-8 code points in the first `in_size` bytes of `in`.
///
/// Unlike the one-argument overload, this version does not stop at a NUL
/// byte: it inspects exactly `in_size` bytes, so `in_size` must not exceed
/// the size of the buffer `in` points to.
///
/// @param in       Buffer to scan; must hold at least `in_size` bytes.
/// @param in_size  Number of bytes to inspect.
/// @return         Number of inspected bytes that are not continuation bytes.
unsigned int utf8_count(const char* in, unsigned int in_size) {
    unsigned int i = 0;  // byte index
    unsigned int c = 0;  // code points seen so far
    while (i < in_size) {
        if ((in[i] & 0xc0) != 0x80) c++;
        i++;
    }
    return c;
}

// Question: I understand the `(in[i] & 0xc0) != 0x80` test, but I don't
// understand why looping on `i < in_size` gives a different result from
// looping on `in[i]`.
Example string: ゴールデンタイムラバー/スキマスイッチ (57 bytes, 19 characters).
Why does utf8_count(in, 57) return 57, not 19?
The binary representation of the example string:

It works fine here: http://ideone.com/oepqg1
I tested it in both Code::Blocks on Windows 8 using g++ 4.8.1 and in MSVC 2013. I also tried it on Linux, and it works. Both functions print 19.
So whatever you're feeding it is not the same string you have in the OP.
// utf8test.cpp : defines entry point console application. // #include "stdafx.h" #include <iostream> #include <cstring> #include <clocale> int strlen_u8(const char* str) { int = 0, j = 0; while (str[i]) { if ((str[i] & 0xc0) != 0x80) { ++j; } ++i; } return j; } int strlen_s_u8(const char* str, unsigned int size) { unsigned int = 0, j = 0; while (i < size) { if ((str[i] & 0xc0) != 0x80) { ++j; } ++i; } return j; } #if defined _msc_ver || defined _win32 || defined _win64 int _tmain(int argc, _tchar* argv[]) #else int main(int argc, char* argv[]) #endif { #ifdef _msc_ver const char* str = "ゴールデンタイムラバー/スキマスイッチ"; #else const char* str = u8"ゴールデンタイムラバー/スキマスイッチ"; std::setlocale(lc_all, "ja_jp.utf-8"); #endif std::cout << strlen_u8(str) << "\n"; std::cout << strlen_s_u8(str, strlen(str)) << "\n"; //can use 57 instead of strlen. std::cin.get(); }
Comments
Post a Comment