OpenCC 全称 Open Chinese Convert,是一个 GitHub 上面的开源项目,主要用于简繁体汉字的转换,支持语义级别的翻译。本文就来简单介绍一下该库的编译,以及 Python、C++ 和 Java 分别如何调用 DLL 进行转换,并记录一些使用过程中踩过的坑。



cmake command line


cmake -G "Visual Studio 14 2015" -D CMAKE_INSTALL_PREFIX="d:/projects/cnblogs/alpha panda/opencc" ../opencc-ver.1.0.5


cmake --build ./ --config RelWithDebInfo --target install


cmake-gui

下载最新版本 CMake,配置工程代码 generator,本文使用的是 Visual Studio 14 2015。



  • unix:/usr/local
  • windows:c:/program files/${project_name}

这里设置为:d:/projects/cnblogs/alpha panda/opencc


visual studio

使用cmake command line或者cmake-gui得到vs工程文件。


在工程 libopencc 的属性配置中寻找一个宏变量:PKGDATADIR(PKGDATADIR="d:/projects/cnblogs/alpha panda/opencc/share//opencc/")


cmake_install_prefix = d:/projects/cnblogs/alpha panda/opencc  set (dir_prefix ${cmake_install_prefix})  set (dir_share ${dir_prefix}/share/)  set (dir_share_opencc ${dir_share}/opencc/)  -dpkgdatadir="${dir_share_opencc}"



利用上面编译得到的libopencc的dll文件,通过python调用来进行字体的转换:(下面的代码改编自 opencc 0.2)

# -*- coding:utf-8 -*-

"""ctypes binding for the OpenCC shared library (adapted from opencc 0.2).

Exposes ``configs`` (the known conversion config names) and ``convert``.
The shared library is located at import time, in this order:

1. the path in the ``libopencc`` environment variable,
2. ``ctypes.util.find_library('opencc')``,
3. the hard-coded fallback ``./share/opencc``.
"""
import os
import sys
from ctypes.util import find_library
from ctypes import CDLL, cast, c_char_p, c_size_t, c_void_p

__all__ = ['configs', 'convert']

if sys.version_info[0] == 3:
    text_type = str
else:
    text_type = unicode  # noqa: F821 -- this name only exists on Python 2

# libc is optional: converted buffers are released through
# opencc_convert_utf8_free, never through libc.free.  Loading it is kept
# best-effort so the module still imports where 'libc.so.6' does not exist.
_libcfile = find_library('c') or 'libc.so.6'
try:
    libc = CDLL(_libcfile, use_errno=True)
    libc.free.argtypes = [c_void_p]
except OSError:
    libc = None

_libopenccfile = os.getenv('libopencc') or find_library('opencc')
if _libopenccfile:
    libopencc = CDLL(_libopenccfile, use_errno=True)
else:
    # NOTE(review): the original author was unsure whether a relative path
    # works here; confirm how it resolves in your deployment.
    libopencc = CDLL('./share/opencc', use_errno=True)

# Declare every prototype explicitly.  Without argtypes ctypes passes Python
# ints as C int, which truncates 64-bit pointers.
libopencc.opencc_open.argtypes = [c_char_p]
libopencc.opencc_open.restype = c_void_p
libopencc.opencc_convert_utf8.argtypes = [c_void_p, c_char_p, c_size_t]
libopencc.opencc_convert_utf8.restype = c_void_p
libopencc.opencc_convert_utf8_free.argtypes = [c_void_p]
libopencc.opencc_convert_utf8_free.restype = None
libopencc.opencc_close.argtypes = [c_void_p]

configs = [
    'hk2s.json', 's2hk.json',
    's2t.json', 's2tw.json', 's2twp.json',
    't2s.json', 'tw2s.json', 'tw2sp.json',
    't2tw.json', 't2hk.json',
]

# How the C error value (opencc_t)-1 appears through a c_void_p restype.
_INVALID_HANDLE = c_void_p(-1).value


class opencc(object):
    """A converter handle opened from one OpenCC config file."""

    def __init__(self, config='t2s.json'):
        """Open the converter described by *config* (e.g. ``'s2t.json'``).

        Raises:
            Exception: if the library reports that the config cannot be opened.
        """
        # Set first so __del__ is safe even when opening fails.
        self._od = None
        handle = libopencc.opencc_open(config.encode('utf-8'))
        if handle is None or handle == _INVALID_HANDLE:
            raise Exception('opencc open error')
        self._od = handle

    def convert(self, text):
        """Convert *text* and return the result as UTF-8 encoded bytes.

        *text* may be a text string (encoded to UTF-8 first) or UTF-8 bytes.

        Raises:
            Exception: if the library reports a conversion failure.
        """
        if isinstance(text, text_type):
            # The C API works on UTF-8 bytes.
            text = text.encode('utf-8')
        retv_i = libopencc.opencc_convert_utf8(self._od, text, len(text))
        # A c_void_p restype yields None for NULL, never -1.
        if not retv_i or retv_i == _INVALID_HANDLE:
            raise Exception('opencc convert error')
        try:
            # .value copies the C string into a Python bytes object, so the
            # native buffer can be released right after.
            return cast(retv_i, c_char_p).value
        finally:
            libopencc.opencc_convert_utf8_free(retv_i)

    def __del__(self):
        """Close the native handle if one was opened."""
        if getattr(self, '_od', None) is not None:
            libopencc.opencc_close(self._od)
            self._od = None


def convert(text, config='t2s.json'):
    """Convert *text* with a one-shot converter for *config*; returns bytes."""
    cc = opencc(config)
    return cc.convert(text)


关于python如何调用dll文件,可以参考我的另一篇文章:python使用ctypes与c/c++ dll文件通信过程介绍及实例分析


# Round-trip check: Simplified -> Traditional -> Simplified.
# print() with a single argument behaves the same on Python 2 and Python 3.
origin_text = u'(理发 vs 发财),(闹钟 vs 一见钟情),后来'.encode('utf-8')
s2t_1 = convert(origin_text, 's2t.json')
t2s_1 = convert(s2t_1, 't2s.json')
print(t2s_1.decode('utf-8'))
print(s2t_1.decode('utf-8'))
print(origin_text == t2s_1)
# ============================================
# Output:
# >>>(理发 vs 发财),(闹钟 vs 一见钟情),后来
# >>>(理髮 vs 發財),(鬧鐘 vs 一見鍾情),後來
# >>>True




string gbktoutf8(const char* strgbk)  {      int len = multibytetowidechar(cp_acp, 0, strgbk, -1, null, 0);      wchar_t* wstr = new wchar_t[len + 1];      memset(wstr, 0, len + 1);      multibytetowidechar(cp_acp, 0, strgbk, -1, wstr, len);      len = widechartomultibyte(cp_utf8, 0, wstr, -1, null, 0, null, null);      char* str = new char[len + 1];      memset(str, 0, len + 1);      widechartomultibyte(cp_utf8, 0, wstr, -1, str, len, null, null);      string strtemp = str;      if (wstr) delete[] wstr;      if (str) delete[] str;      return strtemp;  }    string utf8togbk(const char* strutf8)  {      int len = multibytetowidechar(cp_utf8, 0, strutf8, -1, null, 0);      wchar_t* wszgbk = new wchar_t[len + 1];      memset(wszgbk, 0, len * 2 + 2);      multibytetowidechar(cp_utf8, 0, strutf8, -1, wszgbk, len);      len = widechartomultibyte(cp_acp, 0, wszgbk, -1, null, 0, null, null);      char* szgbk = new char[len + 1];      memset(szgbk, 0, len + 1);      widechartomultibyte(cp_acp, 0, wszgbk, -1, szgbk, len, null, null);      string strtemp(szgbk);      if (wszgbk) delete[] wszgbk;      if (szgbk) delete[] szgbk;      return strtemp;  }



#include <cstdio>
#include <cstdlib>
#include <iostream> #include <string> #include <windows.h> #include <fstream> #include "../opencc-ver.1.0.5/src/opencc.h" //using namespace std; using std::cout; using std::endl; using std::string; #define opencc_api_export __declspec(dllimport) opencc_api_export char* opencc_convert_utf8(opencc_t opencc, const char* input, size_t length); opencc_api_export int opencc_close(opencc_t opencc); opencc_api_export opencc_t opencc_open(const char* configfilename);
opencc_api_export void opencc_convert_utf8_free(char* str); #pragma comment(lib, "../build/src/relwithdebinfo/opencc.lib")
string gbktoutf8(const char* strgbk); string utf8togbk(const char* strutf8); int main() { char* trans_conf = "s2t.json"; char* trans_res = nullptr; string gbk_str, utf8_str, res; // read from file and write translation results to file std::ifstream infile; std::ofstream outfile; infile.open("infile.txt", std::ifstream::in); outfile.open("outfile.txt", std::ifstream::out); // open the config file opencc_t conf_file = opencc_open(trans_conf); while (infile.good()) { infile >> gbk_str; utf8_str = gbktoutf8(gbk_str.c_str()); std::cout << gbk_str << "n"; trans_res = opencc_convert_utf8(conf_file, utf8_str.c_str(), utf8_str.length()); cout << utf8togbk(trans_res) << endl; outfile << trans_res << endl;
opencc_convert_utf8_free(trans_res); // delete[] trans_res; trans_res = nullptr; } infile.close(); outfile.close(); opencc_close(conf_file); conf_file = nullptr; system("pause"); return 0; }




package com.tvjody;

import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import com.sun.jna.IntegerType;
import com.sun.jna.Library;
import com.sun.jna.Native;
import com.sun.jna.Platform;
import com.sun.jna.Pointer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.FileOutputStream;

/**
 * Demo: converts the text of in.txt with OpenCC (s2t.json) through JNA and
 * writes the result to out.txt. Both files are read and written as UTF-8.
 */
public class jna_call {

    /** JNA mapping of the C {@code size_t} type (pointer-sized, unsigned). */
    public static class SizeT extends IntegerType {
        private static final long serialVersionUID = 1L;

        public SizeT() {
            this(0);
        }

        public SizeT(long value) {
            super(Native.SIZE_T_SIZE, value, true);
        }
    }

    /** The subset of the OpenCC C API used by this demo. */
    public interface openccdll extends Library {
        // JNA expands "opencc" to the platform file name itself.
        openccdll instance = (openccdll) Native.load("opencc", openccdll.class);

        //        void* opencc_open(const char* configfilename);
        Pointer opencc_open(String configfilename);

        //        int opencc_close(void* opencc);
        int opencc_close(Pointer opencc);

        //        void opencc_convert_utf8_free(char* str);
        // Takes the Pointer returned by opencc_convert_utf8. A Java String
        // would make JNA pass a temporary copy, and the DLL would free that.
        void opencc_convert_utf8_free(Pointer str);

        //        char* opencc_convert_utf8(opencc_t opencc, const char* input, size_t length)
        // Returns a Pointer so the native buffer can be released afterwards;
        // a String return type would lose the address and leak the buffer.
        Pointer opencc_convert_utf8(Pointer opencc, byte[] input, SizeT length);
    }

    /** Returns true when {@code handle} is the C error value (opencc_t)-1 or null. */
    private static boolean isInvalidHandle(Pointer handle) {
        if (handle == null) {
            return true;
        }
        long raw = Pointer.nativeValue(handle);
        // NOTE(review): assumes a 32-bit -1 may surface without sign
        // extension -- confirm on a 32-bit JVM.
        return raw == -1L || (Native.POINTER_SIZE == 4 && raw == 0xFFFFFFFFL);
    }

    /** Overwrites out.txt with {@code utf8_str}, encoded as UTF-8. */
    public static void writetofile(String utf8_str) throws IOException {
        // try-with-resources closes the writer even when write() throws.
        try (Writer out = new OutputStreamWriter(new FileOutputStream("out.txt"), StandardCharsets.UTF_8)) {
            out.write(utf8_str);
        }
    }

    /**
     * Reads in.txt as UTF-8 and returns its lines concatenated.
     * Line terminators are dropped, as in the original implementation.
     */
    public static String readfromfile() throws IOException {
        StringBuilder res = new StringBuilder();
        try (Reader in = new InputStreamReader(new FileInputStream("in.txt"), StandardCharsets.UTF_8);
             BufferedReader read_buf = new BufferedReader(in)) {
            String line;
            while ((line = read_buf.readLine()) != null) {
                res.append(line);
            }
        }
        return res.toString();
    }

    public static void main(String[] args) throws UnsupportedEncodingException, FileNotFoundException {
        // Must be set before openccdll.instance is first touched, because
        // that is when JNA loads the library.
        System.setProperty("jna.library.path", "d:\\projects\\open_source\\opwncc\\build(x64)\\src\\relwithdebinfo");
        Pointer conf_file = openccdll.instance.opencc_open("s2t.json");
        if (isInvalidHandle(conf_file)) {
            System.err.println("cannot open opencc config: s2t.json");
            return;
        }
        try {
            String res_utf8 = readfromfile();
            System.out.println("from: " + res_utf8);
            byte[] ptext = res_utf8.getBytes(StandardCharsets.UTF_8);
            // JNA passes byte[] as-is; add a trailing NUL so the buffer is
            // also a valid C string.
            byte[] input = Arrays.copyOf(ptext, ptext.length + 1);
            Pointer native_res = openccdll.instance.opencc_convert_utf8(conf_file, input, new SizeT(ptext.length));
            if (native_res == null) {
                throw new IOException("opencc convert error");
            }
            String trans_res;
            try {
                trans_res = native_res.getString(0, "UTF-8");
            } finally {
                // Release the buffer the DLL allocated, through the DLL.
                openccdll.instance.opencc_convert_utf8_free(native_res);
            }
            System.out.println("to:" + trans_res);
            writetofile(trans_res);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            openccdll.instance.opencc_close(conf_file);
        }
    }
}









// NOTE(review): presumably the declaration as found in the OpenCC sources,
// without an export macro -- confirm against the header.
void convertdictionary(const string inputfilename, const string outputfilename, const string formatfrom, const string formatto);
// NOTE(review): presumably the amended declaration, with the export macro
// added so DLL clients can link to it -- confirm.
opencc_export void convertdictionary(const string inputfilename, const string outputfilename, const string formatfrom, const string formatto);




java.lang.unsatisfiedlinkerror: %1 不是有效的 win32 应用程序


使用cmake-gui configure直接指定64位的编译器,选择visual studio 14 2015 win64,而不是visual studio 14 2015。

如果当前的工程为32位的工程,可以在vs中通过configuration manager来手动配置为x64位。将32位工程手动改为64位工程可能会有许多的坑,比如:

fatal error lnk1112: module machine type ‘x64’ conflicts with target machine type ‘x86’


  1. check your properties options in your linker settings at: properties > configuration properties > linker > advanced > target machine. select machinex64 if you are targeting a 64 bit build, or machinex86 if you are making a 32 bit build.
  2. select build > configuration manager from the main menu in visual studio. make sure your project has the correct platform specified. it is possible for the ide to be set to build x64 but an individual project in the solution can be set to target win32. so yeah, visual studio leaves a lot of rope to hang yourself, but that’s life.
  3. check that your library files really are built for the platform you are targeting. this can be checked with dumpbin.exe, which is in your visual studio VC\bin directory. use the -headers option to dump all your functions. look for the machine entry for each function. it should include x64 if it’s a 64 bit build.
  4. in visual studio, select tools > options from the main menu. select projects and solutions > vc++ directories. select x64 from the platform dropdown. make sure that the first entry is: $(VCInstallDir)\bin\x86_amd64 followed by $(VCInstallDir)\bin.
  5. check in visual studio: project properties -> configuration properties -> linker -> command line. "additional options" should not contain /machine:x86. i had such a key, generated by cmake output: cmake generated an x86 project, then i added the x64 platform via configuration manager in visual studio 2010 – everything was created fine for the new platform except the linker command line, which specified /machine:x86 separately.






1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(32): error c3688: invalid literal suffix '銆'; literal operator or literal operator template 'operator ""銆' not found
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(32): error c3688: invalid literal suffix '锛'; literal operator or literal operator template 'operator ""锛' not found
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(32): error c3688: invalid literal suffix '鈥'; literal operator or literal operator template 'operator ""鈥' not found
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(32): error c2001: newline in constant
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(33): error c3688: invalid literal suffix '鈥'; literal operator or literal operator template 'operator ""鈥' not found
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(33): error c3688: invalid literal suffix '锛'; literal operator or literal operator template 'operator ""锛' not found
1>d:\projects\open_source\opwncc\opencc-ver.1.0.5\src\phraseextract.cpp(33): error c3688: invalid literal suffix '銆'; literal operator or literal operator template 'operator ""銆' not found

view code

文本编码对应关系(visual studio 2015 vs notepad++):
File -> Advanced Save Options:

chinese simplified (gb2312) - codepage 936 <==> gbk  unicode (utf-8 with signature) - codepage 65001 <==> encoding in utf-8 bom  unicode (utf-8 without signature) - codepage 65001 <==> encoding in utf-8

将上面文件的编码方式从unicode (utf-8 without signature) – codepage 65001改为 chinese simplified (gb2312) – codepage 936即可。

Python 的编码转换比较简单,C++ 的转换接口上面已经列出。至于 Java,建议将 Java 文件和数据文件的编码方式均改为 UTF-8;使用 String utf8_str = new String(gbk_str.getBytes("utf-8"), "utf-8") 这种转码方式可能带来一些奇怪的问题。




上面的c++代码在exe中delete dll分配的空间,是一种未定义行为。


实际使用尤其是使用不同语言对opencc.dll进行调用的时候会碰到很多问题,这时最好的办法就是使用vs的attach to process对dll进行断点跟进。

对于 Python 调用 DLL,可以先打开一个 Python shell 或者 IDLE 环境并在其中调用一下 DLL,之后在 VS 中 attach 到对应的 Python 进程。不要直接 attach 到 Sublime 等 IDE 程序,因为直接调用 DLL 文件的是 IDE 中运行的 Python 程序,而不是 IDE 本身。







