More than 5 years have passed since last update.

UTF-8文字列を入出力するDLLと文字化け（c++/cli）

Posted at 2017-08-21

c++/cliの文字列を操作するDLL作成を試みましたが、文字化けの解消に苦労して、ネット検索したコードをつなげてなんとか文字化け回避方法が見つかったのでサンプルDLLの作成方法をメモとして記事にしておくことにしました。
c++及びポインタ等の基礎的なことを十分には理解していないため、間違いあるいは不適切なコードの可能性も高いので指摘してくださる人がいればありがたいと思っています。

環境
Windows 10 Pro 64bit (H270 chipset) and Windows 10 Home 32bit (OS X El Capitan 上の Parallels Desktop)
VisualStudio2017

作成手順
1 Visual StudioでC++の空のプロジェクトを新規作成(HtmlSource)
2 HtmlSource.h、HtmlSource.cpp、HtmlSource.defを追加
3 プロジェクトのプロパティを変更

出力「dll」
文字セット「Unicode」
共通言語ランタイムサポート「clr」
「リンカー」 - 「入力」 - 「モジュール定義ファイル」がHtmlSource.defになっていることを確認

　例として、Qiitaを「文字化け」で検索したページソースを取得するサンプルDLLを作成
　文字化けがないことを確認するため、全角文字を含むurl引数をDLL中でテキストファイルに保存
　また、DLL内部で作成した全角文字列変数もテキストファイルに保存し文字化けの確認に利用した。

//---------- HtmlSource.def
; Module-definition file handed to the linker (see the project setting
; Linker > Input > Module Definition File). It lists the functions the DLL exports.
LIBRARY

EXPORTS
	 ReadSource

//---------- HtmlSource.h
# pragma once

// Exported DLL entry point (listed in HtmlSource.def, defined in HtmlSource.cpp).
// url, start and end are decoded as UTF-8 C strings by the definition; the page at
// url is downloaded and the text between the start and end markers is returned as
// a UTF-8 C string. enc names the page encoding; see the definition for how it is
// interpreted.
const char* __stdcall ReadSource(const char* url, const char* enc, const char* start, const char* end);

//---------- HtmlSource.cpp
# include "HtmlSource.h"

# include <msclr/marshal_cppstd.h>

# using <mscorlib.dll>
# using <System.dll>

//using namespace msclr::interop;

using namespace System;
using namespace System::Text;
using namespace System::Net;
using namespace System::IO;
using namespace System::Runtime::InteropServices;

// Forward declarations of the helpers defined later in this file.
System::String^ ConvertFromUtf8Chars(const char** strInput);
const char* ToUtf8(String^ & source);
const char* ToUtf8_2(String^ & source);		// method 2 (alternative UTF-8 output)

void GetHtml(String^ url, Encoding^ enc, String^ start, String^ end, String^ & source);

// Exported entry point of the DLL.
//
// c_url   : UTF-8 C string, URL of the page to download.
// c_enc   : name of the page encoding: "shift_jis" or "euc" (matched ignoring case);
//           anything else, including a null pointer, selects UTF-8.
// c_start : UTF-8 C string, marker where the returned excerpt starts.
// c_end   : UTF-8 C string, marker where the returned excerpt ends.
//
// Returns the excerpt produced by GetHtml, converted to a UTF-8 C string by ToUtf8.
// As a mojibake check it also writes the decoded URL and a string literal created
// inside the DLL to two text files under c:\data\Tradestation\temp.
const char* __stdcall ReadSource(const char* c_url, const char* c_enc, const char* c_start, const char* c_end) {
	System::String^ url = ConvertFromUtf8Chars(&c_url);
	System::String^ start = ConvertFromUtf8Chars(&c_start);
	System::String^ end = ConvertFromUtf8Chars(&c_end);

	// The encoding name must be compared by content. Comparing c_enc with a string
	// literal using == only compares two addresses, which never match for a string
	// passed in by a caller, so every request silently fell back to UTF-8.
	// A null c_enc was harmless before, so keep it harmless: treat it as "no name".
	System::String^ encName = (c_enc == nullptr) ? String::Empty : ConvertFromUtf8Chars(&c_enc);

	Encoding^ enc = Encoding::UTF8;
	if (String::Equals(encName, "shift_jis", StringComparison::OrdinalIgnoreCase))
		enc = Encoding::GetEncoding("shift_jis");
	else if (String::Equals(encName, "euc", StringComparison::OrdinalIgnoreCase))
		enc = Encoding::GetEncoding("euc-jp");

	String^ source = nullptr;
	GetHtml(url, enc, start, end, source);

	// Diagnostic output 1: the url argument after decoding, to confirm that full-width
	// characters passed in from the caller survived the conversion.
	String^ file = "c:\\data\\Tradestation\\temp\\cpp_testChar_input.txt";
	IO::File::WriteAllText(file, "引数 const chr* c_url\n" + url, Encoding::UTF8);

	// Diagnostic output 2: a String^ created inside the DLL.
	String^ str = "DLL内部のString^";
	file = "c:\\data\\Tradestation\\temp\\cpp_testChar_inner.txt";
	IO::File::WriteAllText(file, str, Encoding::UTF8);

	return ToUtf8(source);
	//return ToUtf8_2(source);

}

// Downloads the resource at url, decodes it with enc and stores an excerpt in source.
//
// url    : address passed to WebClient::OpenRead.
// enc    : encoding used to decode the response body.
// start  : the excerpt begins at the first occurrence of this marker (the marker
//          itself is included). Null, empty or not found: begin at the start of the text.
// end    : the excerpt stops just before the first occurrence of this marker at or
//          after the start position. Null, empty or not found: run to the end of the text.
// source : receives the excerpt.
//
// Exceptions thrown by WebClient / StreamReader propagate to the caller.
void GetHtml(String^ url, Encoding^ enc, String^ start, String^ end, String^ & source)
{
	String^ s = nullptr;
	{
		// Stack semantics: wc and sr are disposed when this scope is left, including
		// when an exception is thrown. Locals declared as handles (^) are NOT disposed
		// automatically on leaving a function; they just wait for the garbage collector.
		// Disposing the StreamReader also closes the response stream it wraps.
		WebClient wc;
		StreamReader sr(wc.OpenRead(url), enc);
		s = sr.ReadToEnd();
	}

	// Markers are matched ordinally (exact code units), not with culture-specific rules.
	int pos = 0;
	if (!String::IsNullOrEmpty(start)) {
		pos = s->IndexOf(start, StringComparison::Ordinal);
		if (pos < 0)
			pos = 0;
	}

	// The end marker is governed by `end` itself. The previous condition tested
	// `start`, so `end` was ignored whenever `start` was empty, and a null `end`
	// reached String::Contains and threw.
	int pos2 = s->Length;
	if (!String::IsNullOrEmpty(end)) {
		pos2 = s->IndexOf(end, pos, StringComparison::Ordinal);
		if (pos2 < 0)
			pos2 = s->Length;
	}

	source = s->Substring(pos, pos2 - pos);
}

// Converts a NUL-terminated UTF-8 C string argument into a String^.
// Based on https://msdn.microsoft.com/ja-jp/library/tz333b9s.aspx
// ("How to: Convert char * String to System::Byte Array").
//
// strInput : address of the pointer to the UTF-8 text.
// Returns nullptr when strInput or the pointer it refers to is null, an empty
// string for empty input, and the decoded text otherwise.
String^ ConvertFromUtf8Chars(const char** strInput) {
	if (strInput == nullptr || *strInput == nullptr)
		return nullptr;

	const char* text = *strInput;
	const int len = static_cast<int>(strlen(text));
	if (len == 0)
		return String::Empty;

	// Copy exactly the payload bytes (no terminator) into a managed array so that
	// Encoding::UTF8 can decode them. (The MSDN sample allocates len + 2.)
	array< Byte >^ input = gcnew array< Byte >(len);
	// Marshal::Copy takes a non-const IntPtr source; the native buffer is only read.
	Marshal::Copy(IntPtr(static_cast<void*>(const_cast<char*>(text))), input, 0, len);
	return Encoding::UTF8->GetString(input);
}


// UTF-8 output, method 1.
//
// Encodes source as UTF-8 and returns it as a NUL-terminated native string.
// A null source is returned as an empty string.
//
// The buffer is allocated with Marshal::AllocHGlobal and ownership passes to the
// caller, who releases it with Marshal::FreeHGlobal (LocalFree on the native side).
//
// Why not return a pin_ptr into the managed byte array (the approach discussed at
// https://stackoverflow.com/questions/7707985/how-to-convert-arraysystembyte-to-char-in-c-clr)?
// A pin_ptr only pins while it is in scope. Once this function returns, the array may
// be moved or collected, so the returned pointer would dangle. The array also carries
// no terminating NUL, and taking &buff[0] of an empty array throws.
const char* ToUtf8(String^ & source) {
	String^ text = (source == nullptr) ? String::Empty : source;

	// Encode to UTF-8 bytes on the managed heap.
	array<System::Byte>^ bytes = Encoding::UTF8->GetBytes(text);
	const int len = bytes->Length;

	// Copy into unmanaged memory that stays valid after this function returns,
	// with room for the terminator.
	IntPtr mem = Marshal::AllocHGlobal(len + 1);
	if (len > 0)
		Marshal::Copy(bytes, 0, mem, len);

	char* pch = static_cast<char*>(mem.ToPointer());
	pch[len] = '\0';
	return pch;
}



//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
//UTF-8出力　方法2
// Reference: http://www.atmark.gr.jp/~s2000/r/rtl/encode.html (about UTF-8 encoding)
//
// Converts a NUL-terminated string in the system ANSI code page (CP_ACP, e.g.
// Shift_JIS / code page 932) to UTF-16 first and then to UTF-8.
//
// Returns a buffer allocated with new[]; the caller must release it with delete[].
// Returns NULL when lpText is NULL or empty, or when either conversion fails.
LPSTR SJIStoUTF8(LPCSTR lpText)
{
	if (lpText == NULL || *lpText == '\0') {
		return NULL;
	}

	// (1) ANSI -> UTF-16.
	//     Ask for the required size (in WCHARs, terminator included) and stop if the
	//     query fails. Allocating a zero-length array and then writing through the
	//     pointer would be a write out of bounds.
	const int cchWideChar = ::MultiByteToWideChar(CP_ACP, 0, lpText, -1, NULL, 0);
	if (cchWideChar <= 0) {
		return NULL;
	}
	// new[] reports failure by throwing, so there is no NULL result to test for.
	LPWSTR lpw = new WCHAR[cchWideChar];
	*lpw = L'\0';

	//     Convert into the buffer sized above.
	const int nUnicodeCount = ::MultiByteToWideChar(CP_ACP, 0, lpText, -1, lpw, cchWideChar);
	if (nUnicodeCount <= 0) {
		delete[] lpw;
		return NULL;
	}

	// (2) UTF-16 -> UTF-8.
	//     Ask for the required size (in bytes, terminator included).
	const int cchMultiByte = ::WideCharToMultiByte(CP_UTF8, 0, lpw, -1, NULL, 0, NULL, NULL);
	if (cchMultiByte <= 0) {
		delete[] lpw;
		return NULL;
	}
	LPSTR lpa = new CHAR[cchMultiByte];
	*lpa = '\0';

	//     Convert into the buffer sized above.
	const int nMultiCount = ::WideCharToMultiByte(CP_UTF8, 0, lpw, -1, lpa, cchMultiByte, NULL, NULL);

	// (3) The UTF-16 scratch buffer is no longer needed on either path.
	delete[] lpw;

	if (nMultiCount <= 0) {
		delete[] lpa;
		return NULL;
	}

	return lpa;
}

// Reference: http://d.hatena.ne.jp/kasei_san/20070618/p1 (converting System::String to various types)
//
// System::String -> char*, via Marshal::StringToHGlobalAnsi (ANSI code page text).
//
// Returns a NUL-terminated copy allocated with new[]; the caller must release it with
// delete[]. Returns NULL for a null or empty string.
static const char* systemStringToChar(System::String^ & systemStr)
{
	if (String::IsNullOrEmpty(systemStr)) {
		return NULL;
	}

	char* rtnSts = NULL;
	IntPtr ansi = Marshal::StringToHGlobalAnsi(systemStr);
	try {
		const char* buf = static_cast<const char*>(ansi.ToPointer());

		// Size the copy from the ANSI text that is actually being copied. The UTF-8
		// byte count of the managed string describes a different encoding.
		const size_t len = strlen(buf);
		rtnSts = new char[len + 1];
		strncpy_s(rtnSts, len + 1, buf, _TRUNCATE);
	}
	finally {
		// Release the unmanaged ANSI copy even if the allocation above throws.
		Marshal::FreeHGlobal(ansi);
	}
	return rtnSts;
}

// UTF-8 output, method 2: String^ -> ANSI char* -> UTF-16 -> UTF-8.
//
// Returns a buffer allocated with new[] by SJIStoUTF8 (release with delete[]),
// or NULL when source is empty or a conversion fails.
// NOTE(review): the text passes through the ANSI code page on the way, so characters
// that code page cannot represent are presumably not preserved; confirm before
// switching ReadSource to this method.
const char* ToUtf8_2(String^ & source) {
	// First convert to an ANSI const char*.
	const char* c_buf = systemStringToChar(source);
	// Then convert that to UTF-8.
	const char* utf8 = SJIStoUTF8(c_buf);
	// The intermediate buffer was allocated with new[] and is not needed any more.
	// It used to be leaked on every call. delete[] on NULL is a no-op.
	delete[] c_buf;
	return utf8;
}
//UTF-8出力　方法2
//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

//--------- C# 確認用Project
using System;
using System.Text;
using System.Runtime.InteropServices;


namespace sConsole
{
    /// <summary>
    /// Console harness that calls ReadSource in HtmlSource.dll and prints the result
    /// together with the two diagnostic files the DLL writes.
    /// </summary>
    class Program
    {
        // url is passed as a raw pointer to a NUL-terminated UTF-8 buffer because it
        // contains non-ASCII characters; the remaining arguments are ASCII here and use
        // the default string marshalling. The parameter names mirror the native
        // prototype (url, enc, start, end).
        [DllImport("c:\\data\\Tradestation\\dll\\HtmlSource.dll")]
        private static extern IntPtr ReadSource(IntPtr url, string enc, string start, string end);

        static void Main(string[] args)
        {
            string url = "http://qiita.com/search?q=文字化け";

            // The unmanaged copy of the URL belongs to this method, so release it once
            // the call has returned, whether or not the call throws.
            IntPtr pUrl = NativeUtf8FromString(url);
            string result;
            try
            {
                // NOTE(review): pOut is not freed here because this harness does not know
                // which allocator the DLL used for the returned buffer. Confirm against
                // the DLL before adding a matching free.
                IntPtr pOut = ReadSource(pUrl, "utf-8", "</form>", "pagination\"");
                result = StringFromNativeUtf8(pOut);
            }
            finally
            {
                Marshal.FreeHGlobal(pUrl);
            }
            Console.WriteLine(result);

            Console.WriteLine("-----------------------------------------");
            string s1 = System.IO.File.ReadAllText("c:\\data\\Tradestation\\temp\\cpp_testChar_input.txt");
            Console.WriteLine("ファイル1 cpp_testChar_input.txt");
            Console.WriteLine(s1);
            string s2 = System.IO.File.ReadAllText("c:\\data\\Tradestation\\temp\\cpp_testChar_inner.txt");
            Console.WriteLine("ファイル2 cpp_testChar_inner.txt");
            Console.WriteLine(s2);
            Console.WriteLine("    hit any key then quit");
            Console.ReadKey();
        }

        // UTF-8 <-> managed string conversion, based on
        // https://stackoverflow.com/questions/10773440/conversion-in-net-native-utf-8-managed-string
        // (Conversion in .net: Native Utf-8 <-> Managed String)

        /// <summary>
        /// Copies a managed string into unmanaged memory as NUL-terminated UTF-8.
        /// The caller must release the result with Marshal.FreeHGlobal.
        /// </summary>
        private static IntPtr NativeUtf8FromString(string managedString)
        {
            int len = Encoding.UTF8.GetByteCount(managedString);
            // One extra byte: the array is zero-initialised, so the last byte is the terminator.
            byte[] buffer = new byte[len + 1];
            Encoding.UTF8.GetBytes(managedString, 0, managedString.Length, buffer, 0);
            IntPtr nativeUtf8 = Marshal.AllocHGlobal(buffer.Length);
            Marshal.Copy(buffer, 0, nativeUtf8, buffer.Length);
            return nativeUtf8;
        }

        /// <summary>
        /// Reads a NUL-terminated UTF-8 string from unmanaged memory.
        /// Returns null when the pointer is IntPtr.Zero.
        /// </summary>
        private static string StringFromNativeUtf8(IntPtr nativeUtf8)
        {
            if (nativeUtf8 == IntPtr.Zero)
            {
                return null;
            }

            // Find the terminator to learn the byte length.
            int len = 0;
            while (Marshal.ReadByte(nativeUtf8, len) != 0) ++len;
            byte[] buffer = new byte[len];
            Marshal.Copy(nativeUtf8, buffer, 0, buffer.Length);
            return Encoding.UTF8.GetString(buffer);
        }
    }
}

戻り値をSystem::String^ __clrcall、あるいは引数をString^変数としてもエラーがなく入出力できましたが、文字化けの解消はできずconst char* で出し入れするようにしました。
また、UTF-8出力は今の所２つの方法が見つかったので両方ともコードに残していますが方法２はデフォルトでは利用していません。
　
今回色々と調べたことで、C#の文字列が参照型だと改めて思い出しました。
C#を独学した初めの頃はヘルプを読んでこれを意識して文字列の変更はStringBuilderばかり使っていたのですが、長年値型のつもりで気楽に使うようになっていました。

参考にさせていただいたページの作者の方々には、この場を借りて深く感謝申し上げます。
間違い等があればご指摘をお願いします。

Reference
1 今さら聞けない、教えてもらえない!! Unicode /マルチバイト文字対応国際化VC ++ プログラミングの基礎!!
2 C++/CLI Tips 文字列操作
3 [C++/CLI]Stringの糸口　System::String^ はハンドル型なのに実体のような動きをする
4 方法: char * 文字列を System::Byte 配列に変換する
5 C++/CLI マネージ配列からネイティブの配列へ変換
6 How to convert array System::Byte to char* in C++ CLR?
7 UTF-8エンコードについて
8 System:Stringを色々な型に変換
9 Conversion in .net: Native Utf-8 <-> Managed String

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up