元々はウォーターマークを除去できないかを調べていたが、ウォーターマークの実体はテキストだったので、テキストを削除する処理として実装した。
iTextSharpのソース（特にPdfContentStreamProcessor.cs）を参考にした。
コンテンツストリーム内のテキストが指定した文字列と完全一致した場合、そのテキストを含む q ～ Q の区間を削除する。
/// <summary>
/// A <see cref="PdfDictionary"/> that resolves keys through a stack of
/// pushed resource dictionaries, consulting the most recently pushed one first.
/// </summary>
private class ResourceDictionary : PdfDictionary
{
    // Pushed dictionaries, oldest first; the last element is the top of the stack.
    private IList<PdfDictionary> scopes = new List<PdfDictionary>();

    /// <summary>Pushes <paramref name="resources"/> on top of the lookup stack.</summary>
    public virtual void Push(PdfDictionary resources)
    {
        scopes.Add(resources);
    }

    /// <summary>Removes the dictionary that was pushed last.</summary>
    public virtual void Pop()
    {
        int top = scopes.Count - 1;
        scopes.RemoveAt(top);
    }

    /// <summary>
    /// Looks <paramref name="key"/> up in the pushed dictionaries, newest first,
    /// and returns the first non-null hit. Null stack entries are skipped.
    /// Falls back to this dictionary's own entries when nothing is found.
    /// </summary>
    public override PdfObject GetDirectObject(PdfName key)
    {
        int index = scopes.Count;
        while (--index >= 0)
        {
            PdfDictionary scope = scopes[index];
            if (scope == null)
            {
                continue;
            }

            PdfObject found = scope.GetDirectObject(key);
            if (found != null)
            {
                return found;
            }
        }

        // Shouldn't be necessary, but kept as a last resort.
        return base.GetDirectObject(key);
    }
}
// Stack of graphics states; Gs() exposes its top element.
private Stack<GraphicsState> gsStack = new Stack<GraphicsState>();

/// <summary>
/// Returns the graphics state currently on top of the stack without removing it.
/// </summary>
public GraphicsState Gs()
{
    GraphicsState current = gsStack.Peek();
    return current;
}
// Fonts already constructed, keyed by the Number of the indirect reference they were built from.
private IDictionary<int, CMapAwareDocumentFont> cachedFonts = new Dictionary<int, CMapAwareDocumentFont>();

/// <summary>
/// Returns the font for <paramref name="ind"/>, constructing and caching it
/// the first time a given reference number is requested.
/// </summary>
/// <param name="ind">Indirect reference to the font object.</param>
/// <returns>The cached or newly created font.</returns>
private CMapAwareDocumentFont GetFont(PRIndirectReference ind)
{
    int number = ind.Number;

    CMapAwareDocumentFont cached;
    if (cachedFonts.TryGetValue(number, out cached) && cached != null)
    {
        return cached;
    }

    // Missing entry (or a null entry): build the font and remember it.
    CMapAwareDocumentFont created = new CMapAwareDocumentFont(ind);
    cachedFonts[number] = created;
    return created;
}
/// <summary>
/// A pair of byte offsets marking a region of a content stream.
/// </summary>
private class TextAreaInfo
{
    // Offset where the region begins.
    public long start;

    // Offset where the region ends.
    public long end;

    /// <summary>Creates a region from <paramref name="s"/> to <paramref name="e"/>.</summary>
    public TextAreaInfo(long s, long e)
    {
        this.start = s;
        this.end = e;
    }
}
// Path of the PDF that is read.
private string InPdf;

// Output path handed to the constructor; it is only stored here.
private string OutPdf;

// Reader opened on InPdf by the constructor.
PdfReader reader;

/// <summary>
/// Stores both paths and opens a <see cref="PdfReader"/> on the input file.
/// </summary>
/// <param name="input">Path of the PDF to read.</param>
/// <param name="output">Output path; stored in <c>OutPdf</c>.</param>
public PDFContentsRemover(string input, string output)
{
    this.InPdf = input;
    this.OutPdf = output;
    this.reader = new PdfReader(this.InPdf);
}
/// <summary>
/// Scans every content stream of every page and removes each <c>q ... Q</c>
/// section that contains a text-showing operator (<c>Tj</c>/<c>TJ</c>) whose
/// decoded text is exactly equal to <paramref name="keyword"/>.
/// The modified data is written back into the in-memory stream objects of
/// <c>reader</c>; this method does not save a file.
/// </summary>
/// <param name="keyword">Text that must match the decoded string exactly.</param>
/// <exception cref="PDFContentsRemoverException">
/// Thrown when the last parsed operand of an instruction is not an operator.
/// </exception>
public void RemoveString(string keyword)
{
    int pc = reader.NumberOfPages;
    for (int i = 1; i <= pc; i++)
    {
        Console.WriteLine("ページ番号:" + i);
        PdfDictionary page = reader.GetPageN(i);

        PdfArray contentArray = page.GetAsArray(PdfName.CONTENTS);
        if (contentArray != null)
        {
            for (int j = 0; j < contentArray.Size; j++)
            {
                RemoveStringFromStream(page, (PRStream)contentArray.GetAsStream(j), keyword);
            }
        }
        else
        {
            // /Contents may be a single stream instead of an array of streams.
            PRStream single = page.GetAsStream(PdfName.CONTENTS) as PRStream;
            if (single != null)
            {
                RemoveStringFromStream(page, single, keyword);
            }
        }
    }
}

// Parses one content stream, collects the q ... Q sections whose text equals
// keyword, and rewrites the stream without them.
// NOTE(review): nested q/Q pairs are not tracked; an inner q overwrites the
// remembered start offset, exactly as in the previous implementation.
private void RemoveStringFromStream(PdfDictionary page, PRStream stream, string keyword)
{
    // Decode once. The tokeniser's offsets refer to this buffer, so the same
    // buffer is used for the offset fix-up and for the final cut.
    byte[] content = ContentByteUtils.GetContentBytesFromContentObject(stream);
    PRTokeniser tokeniser = new PRTokeniser(
        new RandomAccessFileOrArray(
            new RandomAccessSourceFactory().CreateSource(content)));
    PdfContentParser parser = new PdfContentParser(tokeniser);

    List<PdfObject> operands = new List<PdfObject>();
    // Areas are per stream: their offsets are meaningless for any other stream.
    List<TextAreaInfo> areas = new List<TextAreaInfo>();
    CMapAwareDocumentFont font = null;
    long posq = -1; // offset of the pending 'q', or -1 when there is none
    bool isText = false, isMatch = false;

    while (parser.Parse(operands).Count > 0)
    {
        // The operator is the last element of the operand list.
        PdfLiteral cmd = operands[operands.Count - 1] as PdfLiteral;
        if (cmd == null)
        {
            // A damaged file can yield an instruction without an operator.
            throw new PDFContentsRemoverException("NULL");
        }
        Console.WriteLine(cmd);

        switch (cmd.ToString())
        {
            case "q": // start of a graphics-state section
                posq = tokeniser.FilePointer;
                // The pointer normally sits just past the 'q'; step back onto it.
                if (posq >= content.Length || content[posq] != 'q')
                {
                    posq--;
                }
                break;

            case "Q": // end of a graphics-state section
                if (posq >= 0 && isText && isMatch)
                {
                    areas.Add(new TextAreaInfo(posq, tokeniser.FilePointer));
                }
                posq = -1;
                isText = false;
                isMatch = false;
                break;

            case "BT": // start of a text object
                isText = true;
                break;

            case "ET":
                // Intentionally a no-op: isText must survive until the closing Q,
                // which is where the decision to cut is made.
                break;

            case "Tf": // font selection
            {
                PdfName fontResourceName = (PdfName)operands[0];
                PdfDictionary resourcesDic = page.GetAsDict(PdfName.RESOURCES);
                PdfDictionary fontsDictionary = resourcesDic.GetAsDict(PdfName.FONT);
                PdfObject fontObject = fontsDictionary.Get(fontResourceName);
                font = GetFont((PRIndirectReference)fontObject);
                break;
            }

            case "Tj": // show a single string
            {
                if (font == null)
                {
                    break; // no Tf seen yet, so the bytes cannot be decoded
                }
                byte[] bytes = ((PdfString)operands[0]).GetBytes();
                string text = font.Decode(bytes, 0, bytes.Length);
                if (isText && text == keyword)
                {
                    isMatch = true;
                }
                Console.WriteLine(text);
                break;
            }

            case "TJ": // show an array of strings and spacing adjustments
            {
                if (font == null)
                {
                    break; // no Tf seen yet, so the bytes cannot be decoded
                }
                PdfArray array = (PdfArray)operands[0];
                StringBuilder sb = new StringBuilder();
                foreach (PdfObject entryObj in array)
                {
                    // Only the string entries contribute text.
                    if (entryObj is PdfString)
                    {
                        byte[] bytes = entryObj.GetBytes();
                        sb.Append(font.Decode(bytes, 0, bytes.Length));
                    }
                }
                string text = sb.ToString();
                if (isText && text == keyword)
                {
                    isMatch = true;
                }
                Console.WriteLine(text);
                break;
            }
        }
    }

    if (areas.Count == 0)
    {
        return; // nothing matched; leave the stream untouched
    }

    byte[] remaining = CutAreas(content, areas);
    stream.Put(PdfName.LENGTH, new PdfNumber(remaining.Length));
    stream.SetData(remaining);
}

// Returns a copy of content with every area removed. For each area the bytes
// from start through end (inclusive) are dropped; areas are processed from the
// last one backwards so that earlier offsets stay valid.
private static byte[] CutAreas(byte[] content, List<TextAreaInfo> areas)
{
    byte[] result = content;
    for (int k = areas.Count - 1; k >= 0; k--)
    {
        int head = (int)areas[k].start;
        // Clamp so that an area ending at the very end of the data is handled.
        int tailStart = Math.Min((int)areas[k].end + 1, result.Length);
        int tailLength = result.Length - tailStart;

        byte[] spliced = new byte[head + tailLength];
        Array.Copy(result, 0, spliced, 0, head);
        Array.Copy(result, tailStart, spliced, head, tailLength);
        result = spliced;
    }
    return result;
}