externcrateregex;useregex::Regex;fnto_unicode_string(raw_string:&str)->String{raw_string.chars().map(|c|{if'\u{4e00}'<=c&&c<='\u{9fff}'{format!("#U{:04x}",casu32)}else{c.to_string()}}).collect()}fnsplit_with_unicode(input:&str)->Vec<&str>{letre=Regex::new(r"#U[0-9a-fA-F]{4}").unwrap();letmutresult=Vec::new();letmutlast_end=0;formatinre.find_iter(input){// Add the portion before the current match to the resultifmat.start()>last_end{result.push(&input[last_end..mat.start()]);}// Add the matched pattern to the resultresult.push(&input[mat.start()..mat.end()]);last_end=mat.end();}// Add the remaining portion of the string, if anyiflast_end<input.len(){result.push(&input[last_end..]);}returnresult}fnfrom_unicode_string(unicode_string:&str)->String{println!("{}",unicode_string);split_with_unicode(unicode_string).iter().map(|&c|{ifc.starts_with("#U"){char::from_u32(u32::from_str_radix(&c[2..],16).unwrap()).unwrap().to_string()}else{c.to_string()}}).collect()}fntest_func(s:&str){letres=to_unicode_string(s);println!("{}",res);println!("{}",from_unicode_string(&res));}fnmain(){lets="1801万里长城永不倒_123445_5_3811-3826s_000004.jpg";println!("{}",s);test_func(s);}// 输出结果:// 1801万里长城永不倒_123445_5_3811-3826s_000004.jpg// 1801#U4e07#U91cc#U957f#U57ce#U6c38#U4e0d#U5012_123445_5_3811-3826s_000004.jpg// 1801#U4e07#U91cc#U957f#U57ce#U6c38#U4e0d#U5012_123445_5_3811-3826s_000004.jpg// 1801万里长城永不倒_123445_5_3811-3826s_000004.jpg
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

namespace {

// Returns the value of one hexadecimal digit, or -1 when `c` is not one.
int hex_digit_value(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

// Parses a token of the exact form "#U"/"#u" + four hex digits.
// Returns the code point, or -1 when `token` is not such an escape.
long parse_unicode_escape(const std::string& token) {
    if (token.size() != 6 || token[0] != '#' ||
        (token[1] != 'U' && token[1] != 'u')) {
        return -1;
    }
    long code = 0;
    for (std::size_t i = 2; i < token.size(); ++i) {
        const int digit = hex_digit_value(token[i]);
        if (digit < 0) return -1;
        code = code * 16 + digit;
    }
    return code;
}

}  // namespace

// Decodes one UTF-8 sequence starting at `it` and advances `it` past the
// bytes it consumed (always at least one).
//
// Precondition: it != end.
// Returns the decoded code point, or U+FFFD when the sequence is malformed
// (stray continuation byte, invalid lead byte, truncated or overlong form).
// Exactly as many continuation bytes are read as the lead byte announces, so
// extra continuation bytes are never folded into the result.
uint32_t utf8_to_codepoint(const char*& it, const char* end) {
    constexpr uint32_t kMalformed = 0xFFFD;

    const unsigned char lead = static_cast<unsigned char>(*it);
    ++it;
    if (lead < 0x80) {
        return lead;
    }

    uint32_t codepoint = 0;
    uint32_t minimum = 0;  // smallest value that needs this sequence length
    int continuation_bytes = 0;
    if ((lead & 0xE0) == 0xC0) {
        codepoint = lead & 0x1F;
        minimum = 0x80;
        continuation_bytes = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        codepoint = lead & 0x0F;
        minimum = 0x800;
        continuation_bytes = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        codepoint = lead & 0x07;
        minimum = 0x10000;
        continuation_bytes = 3;
    } else {
        return kMalformed;
    }

    for (int i = 0; i < continuation_bytes; ++i) {
        if (it == end) {
            return kMalformed;
        }
        const unsigned char next = static_cast<unsigned char>(*it);
        if ((next & 0xC0) != 0x80) {
            return kMalformed;  // leave `next` for the following call
        }
        codepoint = (codepoint << 6) | (next & 0x3F);
        ++it;
    }
    return codepoint < minimum ? kMalformed : codepoint;
}

// Encodes a UTF-8 string so that it contains no CJK ideographs: every code
// point in U+4E00..U+9FFF becomes "#U" + four lowercase hex digits; all other
// bytes (including malformed ones) are copied through unchanged.
std::string to_unicode_string(const std::string& raw_string) {
    std::ostringstream oss;
    const char* it = raw_string.c_str();
    const char* end = it + raw_string.size();
    while (it < end) {
        const char* start = it;
        const uint32_t codepoint = utf8_to_codepoint(it, end);
        if (codepoint >= 0x4E00 && codepoint <= 0x9FFF) {
            oss << "#U" << std::hex << std::setw(4) << std::setfill('0')
                << codepoint;
        } else {
            // Unformatted write: unaffected by the sticky std::hex/setfill.
            oss.write(start, it - start);
        }
    }
    return oss.str();
}

// Splits `s` into alternating pieces: the text before each escape (possibly
// empty) followed by the escape itself, plus any text after the last escape.
std::vector<std::string> split_with_unicode(const std::string& s) {
    // Built once; constructing a std::regex is expensive.
    static const std::regex re(R"((#[Uu][0-9a-fA-F]{4}))");
    std::sregex_token_iterator iter(s.begin(), s.end(), re, {-1, 0});
    std::sregex_token_iterator end;
    return std::vector<std::string>(iter, end);
}

// Reverses to_unicode_string: each "#Uxxxx"/"#uxxxx" escape is replaced by the
// UTF-8 encoding of the code point it names; everything else is copied.
//
// Only pieces that are a complete escape are decoded. Literal text that just
// happens to begin with "#U" (e.g. "#Uxyz") is kept verbatim, as are escapes
// naming a surrogate (D800..DFFF), which has no valid UTF-8 encoding.
std::string from_unicode_string(const std::string& unicode_string) {
    std::string decoded;
    decoded.reserve(unicode_string.size());
    for (const auto& part : split_with_unicode(unicode_string)) {
        const long code = parse_unicode_escape(part);
        const bool is_surrogate = code >= 0xD800 && code <= 0xDFFF;
        if (code < 0 || is_surrogate) {
            decoded += part;
        } else if (code < 0x80) {
            decoded += static_cast<char>(code);
        } else if (code < 0x800) {
            decoded += static_cast<char>((code >> 6) | 0xC0);
            decoded += static_cast<char>((code & 0x3F) | 0x80);
        } else {
            // Four hex digits never exceed 0xFFFF, so three bytes suffice.
            decoded += static_cast<char>((code >> 12) | 0xE0);
            decoded += static_cast<char>(((code >> 6) & 0x3F) | 0x80);
            decoded += static_cast<char>((code & 0x3F) | 0x80);
        }
    }
    return decoded;
}

// Prints the encoded form of `s`, then the result of decoding it again.
void test_func(const std::string& s) {
    const auto res = to_unicode_string(s);
    std::cout << res << '\n';
    std::cout << from_unicode_string(res) << '\n';
}

int main() {
    const std::string s = "1801万里ggg长城永不倒_123445_5_3811-3826s_000004.jpg";
    test_func(s);
    return 0;
}
#include <codecvt>
#include <iomanip>
#include <iostream>
#include <locale>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace {

// Returns the value of one hexadecimal digit, or -1 when `c` is not one.
int hex_digit_value(wchar_t c) {
    if (c >= L'0' && c <= L'9') return static_cast<int>(c - L'0');
    if (c >= L'a' && c <= L'f') return static_cast<int>(c - L'a') + 10;
    if (c >= L'A' && c <= L'F') return static_cast<int>(c - L'A') + 10;
    return -1;
}

// Parses a token of the exact form L"#U"/L"#u" + four hex digits.
// Returns the code point, or -1 when `token` is not such an escape.
long parse_unicode_escape(const std::wstring& token) {
    if (token.size() != 6 || token[0] != L'#' ||
        (token[1] != L'U' && token[1] != L'u')) {
        return -1;
    }
    long code = 0;
    for (std::size_t i = 2; i < token.size(); ++i) {
        const int digit = hex_digit_value(token[i]);
        if (digit < 0) return -1;
        code = code * 16 + digit;
    }
    return code;
}

}  // namespace

// Encodes a wide string so that it contains no CJK ideographs: every
// character in U+4E00..U+9FFF becomes L"#U" + four lowercase hex digits; all
// other characters are copied unchanged.
std::wstring to_unicode_wstring(const std::wstring& raw_wstring) {
    std::wostringstream woss;
    // A new stream copies the global locale. Once main() installs the user's
    // locale, integer output (hex included) gets thousands separators, which
    // turned "#U4e07" into "#U4,e07". The classic locale never groups digits.
    woss.imbue(std::locale::classic());
    for (wchar_t wc : raw_wstring) {
        if (wc >= 0x4E00 && wc <= 0x9FFF) {
            woss << L"#U" << std::hex << std::setw(4) << std::setfill(L'0')
                 << static_cast<int>(wc);
            woss << std::dec;
        } else {
            woss << wc;
        }
    }
    return woss.str();
}

// Splits `s` into alternating pieces: the text before each escape (possibly
// empty) followed by the escape itself, plus any text after the last escape.
// Every piece is also traced to std::wcout.
std::vector<std::wstring> split_with_unicode(const std::wstring& s) {
    // Built once; constructing a regex is expensive.
    static const std::wregex re(L"(#[Uu][0-9a-fA-F]{4})");
    std::wsregex_token_iterator iter(s.begin(), s.end(), re, {-1, 0});
    std::wsregex_token_iterator end;
    std::vector<std::wstring> result(iter, end);
    for (const auto& item : result) {
        std::wcout << "split : " << item << std::endl;
    }
    return result;
}

// Reverses to_unicode_wstring: each L"#Uxxxx"/L"#uxxxx" escape is replaced by
// the wide character it names; everything else is copied unchanged.
//
// Only pieces that are a complete escape are decoded. Literal text that just
// happens to begin with "#U" (e.g. L"#Uxyz") is kept verbatim instead of
// being replaced by a bogus character.
std::wstring from_unicode_wstring(const std::wstring& unicode_wstring) {
    std::wstring decoded;
    decoded.reserve(unicode_wstring.size());
    for (const auto& part : split_with_unicode(unicode_wstring)) {
        const long code = parse_unicode_escape(part);
        if (code < 0) {
            decoded += part;
        } else {
            decoded += static_cast<wchar_t>(code);
        }
    }
    return decoded;
}

// Prints the encoded form of `s`, then the result of decoding it again.
//
// The encoded string is now decoded directly. It used to be replaced by a
// hard-coded literal because the encoder emitted "#U4,e07"-style escapes
// under a digit-grouping locale.
// To show the result as UTF-8 bytes instead, a
// std::wstring_convert<std::codecvt_utf8<wchar_t>> could convert it for
// std::cout, but narrow and wide output must not be mixed on stdout.
void test_func(const std::wstring& s) {
    const auto res = to_unicode_wstring(s);
    std::wcout << L"Encoded: " << res << std::endl;
    std::wcout << L"Decoded: \n" << from_unicode_wstring(res) << std::endl;
}

int main() {
    // Use the environment's locale so std::wcout can emit non-ASCII text.
    try {
        std::locale::global(std::locale(""));
    } catch (const std::runtime_error&) {
        // The environment names a locale that is not installed: keep the
        // default "C" locale rather than terminating.
    }
    const std::wstring s = L"1801万里长城永不倒_123445_5_3811-3826s_000004.jpg";
    test_func(s);
    return 0;
}
#include <codecvt>
#include <iostream>
#include <locale>
#include <string>

// Converts a UTF-8 encoded std::string to a std::wstring.
// std::wstring_convert::from_bytes throws std::range_error when `str` is not
// valid UTF-8. NOTE(review): std::wstring_convert / std::codecvt_utf8 are
// deprecated since C++17; kept because the standard offers no replacement.
std::wstring string_to_wstring(const std::string& str) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.from_bytes(str);
}

// Converts a std::wstring to a UTF-8 encoded std::string.
// std::wstring_convert::to_bytes throws std::range_error when a character
// cannot be converted.
std::string wstring_to_string(const std::wstring& wstr) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(wstr);
}

int main() {
    const std::string narrow = "Hello, world!";
    const std::wstring wide = string_to_wstring(narrow);
    // Everything is printed through std::cout. Writing to std::wcout first
    // makes stdout wide-oriented, after which the std::cout line below would
    // be silently dropped; so the wide string is converted back for display.
    std::cout << "Converted to wstring: " << wstring_to_string(wide) << '\n';

    const std::wstring wstr = L"你好,世界!";
    const std::string str = wstring_to_string(wstr);
    std::cout << "Converted to string: " << str << '\n';
    return 0;
}
The underlying C streams have a concept of being either wide-oriented or narrow-oriented: a stream's orientation is fixed by the first I/O operation performed on it. You can't use wide output functions on a narrow-oriented stream, or vice versa.
#include <iostream>
#include <locale>
#include <stdexcept>

int main() {
    // Create a locale object. Constructing a named locale throws
    // std::runtime_error when it is not installed, so fall back to the
    // classic "C" locale instead of terminating.
    std::locale loc = std::locale::classic();
    try {
        loc = std::locale("C.UTF-8");
    } catch (const std::runtime_error& e) {
        std::cerr << "locale \"C.UTF-8\" is unavailable (" << e.what()
                  << "); using the classic \"C\" locale instead\n";
    }

    // Apply the locale to the stream.
    std::cout.imbue(loc);

    // Obtain a copy of the current global locale (shown for illustration only).
    std::locale currentLocale = std::locale();
    static_cast<void>(currentLocale);

    // Use the locale for character classification.
    const char c = 'a';
    if (std::isalpha(c, loc)) {
        std::cout << c << " is alphabetic in the given locale." << std::endl;
    }
}