2009年4月1日水曜日

文字コード変換

 やっぱ、お手軽に使える感じの文字コード変換が欲しい…。というわけで作りました。
wconv.hpp
#ifndef WCONV_HPP
#define WCONV_HPP

#include <string>

/// Converts a multibyte string encoded in `code_page` into a wide string.
/// The implementation (wconv.cpp) hands `src` to Win32 MultiByteToWideChar as
/// a NUL-terminated C string, so conversion stops at the first NUL byte.
/// @param src        Multibyte text to convert.
/// @param code_page  Windows code page identifier describing the encoding of `src`.
/// @return The converted wide string.
std::wstring string_to_unicode( const std::string& src, unsigned code_page );

/// Converts a wide string into a multibyte string encoded in `code_page`.
/// The implementation (wconv.cpp) hands `src` to Win32 WideCharToMultiByte as
/// a NUL-terminated C string, so conversion stops at the first NUL character.
/// @param src        Wide text to convert.
/// @param code_page  Windows code page identifier of the desired output encoding.
/// @return The converted multibyte string.
std::string unicode_to_string( const std::wstring& src, unsigned code_page );

template <int CODEPAGE>
std::wstring string_to_unicode( const std::string& src ) {
  return string_to_unicode( src, CODEPAGE );
}

template <int CODEPAGE>
std::string unicode_to_string( const std::wstring& src ) {
  return unicode_to_string( src, CODEPAGE );
}

/// Re-encodes a multibyte string from one code page to another by going
/// through a wide-string intermediate.
/// @tparam DST_CODE_PAGE  Code page identifier of the returned string.
/// @tparam SRC_CODE_PAGE  Code page identifier describing the encoding of `src`.
/// @param  src            Multibyte text encoded in SRC_CODE_PAGE.
/// @return `src` converted to DST_CODE_PAGE.
template <int DST_CODE_PAGE, int SRC_CODE_PAGE>
std::string code_to_code( const std::string& src ) {
  // Step 1: source encoding -> wide string.
  const std::wstring wide = string_to_unicode<SRC_CODE_PAGE>( src );
  // Step 2: wide string -> destination encoding.
  return unicode_to_string<DST_CODE_PAGE>( wide );
}

#endif // WCONV_HPP


codepage.hpp
#ifndef CODE_PAGE_HPP
#define CODE_PAGE_HPP

/// Code page identifiers taken from winnls.h.
/// NOTE(review): these repeat names that winnls.h itself defines; the values
/// here are presumably identical so that including both is harmless — confirm.
//@{
#define CP_ACP               0  // Current Code Page
#define CP_UTF7          65000  // UTF7
#define CP_UTF8          65001  // UTF8
//@}

// OEM code pages (CP_OEM_* names).
#define CP_OEM_US          437
// NOTE(review): "ALABIC" looks like a misspelling of "ARABIC"; the name is
// left unchanged because callers may already depend on it.
#define CP_OEM_ALABIC      720
#define CP_OEM_GREEK       737
#define CP_OEM_BALTIC      775
#define CP_OEM_MLATIN1     850
#define CP_OEM_LATIN2      852
#define CP_OEM_CYRILLIC    855
#define CP_OEM_TURKISH     857
#define CP_OEM_MLATIN1P    858
#define CP_OEM_HEBREW      862
#define CP_OEM_RUSSIAN     866

// Thai and East Asian code pages (Shift_JIS, GBK, Korean, Big5).
#define CP_THAI            874
#define CP_SJIS            932
#define CP_GBK             936
#define CP_KOREA           949
#define CP_BIG5            950

// Code pages 1250-1258.
#define CP_EUROPE         1250
#define CP_CYRILLIC       1251
#define CP_LATIN1         1252
#define CP_GREEK          1253
#define CP_TURKISH        1254
#define CP_HEBREW         1255
#define CP_ARABIC         1256
#define CP_BALTIC         1257
#define CP_VIETNAM        1258

// ISO code pages (CP_ISO_* names, identifiers 28591-28605).
#define CP_ISO_LATIN1    28591
#define CP_ISO_LATIN2    28592
#define CP_ISO_LATIN3    28593
#define CP_ISO_BALTIC    28594
#define CP_ISO_CYRILLIC  28595
#define CP_ISO_ARABIC    28596
#define CP_ISO_HEBREW    28598
#define CP_ISO_TURKISH   28599
#define CP_ISO_LATIN9    28605

#endif // CODE_PAGE_HPP


wconv.cpp
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <winnls.h>
#include <string>
#include <boost/scoped_array.hpp>

#include "wconv.hpp"

std::wstring string_to_unicode( const std::string& src, unsigned code_page ) {
  int result_length = MultiByteToWideChar( code_page, 0, src.c_str(), -1, 0, 0 );
  if( result_length > 255 ) {
    boost::scoped_array<wchar_t> tbuff( new wchar_t[ result_length + 2 ] );
    MultiByteToWideChar( code_page, 0, src.c_str(), -1, tbuff.get(), result_length );
    std::wstring result = tbuff.get();
    return result;
  } else {
    wchar_t tbuff[ 256 ];
    MultiByteToWideChar( code_page, 0, src.c_str(), -1, tbuff, result_length );
    std::wstring result = tbuff;
    return result;
  }
}

std::string unicode_to_string( const std::wstring& src, unsigned code_page ) {
  int result_length = WideCharToMultiByte( code_page, 0, src.c_str(), -1, 0, 0, 0, 0 );
  if( result_length > 511 ) {
    boost::scoped_array<char> tbuff( new char[ result_length + 2 ] );
    WideCharToMultiByte( code_page, 0, src.c_str(), -1, tbuff.get(), result_length, 0, 0 );
    std::string result = tbuff.get();
    return result;
  } else {
    char tbuff[ 512 ];
    WideCharToMultiByte( code_page, 0, src.c_str(), -1, tbuff, result_length, 0, 0 );
    std::string result = tbuff;
    return result;
  }
}

//#define WCONV_TEST_APP
#ifdef WCONV_TEST_APP

#include "codepage.hpp"
#include <fstream>
#include <iostream>

//#define YOURPAGE  CP_SJIS
#define YOURPAGE  CP_ACP

int main(int argc, char* argv[]) {
  if( argc < 3 )  return 1;
  std::ifstream ifs( argv[1] );
  std::ofstream ofs( argv[2] );
  //std::wcout.imbue( std::locale("") );
  std::string str;
  while( std::getline( ifs, str ) ) {
    std::string dst = code_to_code<CP_UTF8,YOURPAGE>( str );
    ofs << dst << std::endl;
    //ofs << code_to_code<YOURPAGE,CP_UTF8>( str ) << std::endl;
  }
  return 0;
}

#endif
2018/08/29 追記: 以前は std::basic_string のバッファに直接書き込んでいたが、それだと終端の null 文字まで結果の文字列に含まれてしまうため、修正を行った。

0 件のコメント: