More than 5 years have passed since last update.
iTextSharpを使ってPDFからテキストを削除する

Posted at 2018-09-19
元々はウォーターマークを取ることができないかを調べていたが、ウォーターマークの実体はテキストだったので。
iTextSharpのソース(特にPdfContentStreamProcessor.cs)を参考にした。
指定した文字列と完全一致したら削除する


        private class ResourceDictionary : PdfDictionary
        {
            private IList<PdfDictionary> resourcesStack = new List<PdfDictionary>();

            virtual public void Push(PdfDictionary resources)
            {
                resourcesStack.Add(resources);
            }

            virtual public void Pop()
            {
                resourcesStack.RemoveAt(resourcesStack.Count - 1);
            }

            public override PdfObject GetDirectObject(PdfName key)
            {
                for (int i = resourcesStack.Count - 1; i >= 0; i--)
                {
                    PdfDictionary subResource = resourcesStack[i];
                    if (subResource != null)
                    {
                        PdfObject obj = subResource.GetDirectObject(key);
                        if (obj != null) return obj;
                    }
                }
                return base.GetDirectObject(key); // shouldn't be necessary, but just in case we've done something crazy
            }
        }

        private Stack<GraphicsState> gsStack = new Stack<GraphicsState>();
        public GraphicsState Gs()
        {
            return gsStack.Peek();
        }

        private IDictionary<int, CMapAwareDocumentFont> cachedFonts = new Dictionary<int, CMapAwareDocumentFont>();
        private CMapAwareDocumentFont GetFont(PRIndirectReference ind)
        {
            CMapAwareDocumentFont font;
            cachedFonts.TryGetValue(ind.Number, out font);
            if (font == null)
            {
                font = new CMapAwareDocumentFont(ind);
                cachedFonts[ind.Number] = font;
            }
            return font;
        }

        private class TextAreaInfo
        {
            public long start;
            public long end;
            public TextAreaInfo(long s, long e)
            {
                start = s;
                end = e;
            }
        }

        private string InPdf;
        private string OutPdf;
        PdfReader reader;

        public PDFContentsRemover(string input, string output)
        {
            InPdf = input;
            OutPdf = output;
            reader = new PdfReader(InPdf);
        }

        public void RemoveString(string keyword)
        {

            int pc = reader.NumberOfPages;
            for (int i = 1; i <= pc; i++)
            {
                Console.WriteLine("ページ番号：" + i);

                PdfDictionary page = reader.GetPageN(i);
                PRStream stream;
                PdfArray contentarray;
                contentarray = page.GetAsArray(PdfName.CONTENTS);

                List<TextAreaInfo> lArea = new List<TextAreaInfo>();

                if (contentarray != null)
                {
                    for (int j = 0; j < contentarray.Size; j++)
                    {
                        stream = (PRStream)contentarray.GetAsStream(j);
                        PRTokeniser tokeniser = new PRTokeniser(
                            new RandomAccessFileOrArray(
                                new RandomAccessSourceFactory().CreateSource(
                                    ContentByteUtils.GetContentBytesFromContentObject(stream)))); // streamの内容をパース。コマンドとコンテンツを取得

                        PdfContentParser ps = new PdfContentParser(tokeniser);
                        List<PdfObject> operands = new List<PdfObject>();
                        CMapAwareDocumentFont font = null;

                        long posq = 0, posQ = 0;
                        bool isText = false, isMatch = false;
                        while (ps.Parse(operands).Count > 0)
                        {
                            PdfLiteral cmd = null;
                            try
                            {
                                cmd = (PdfLiteral)operands[operands.Count - 1];// operands[]の最後がコマンド
                            }
                            catch (Exception)
                            {
                                // ファイルが破損しているとコマンドが取れない
                                throw new PDFContentsRemoverException(cmd == null ? "NULL" : cmd.ToString());
                            }
                            
                            Console.WriteLine(cmd);
                            if (cmd.ToString() == "q") // GraphicsStateの開始
                            {
                                posQ = -1;
                                try
                                {
                                    posq = tokeniser.FilePointer;
                                    // Pointerの位置がqの後の改行を指している場合
                                    if (PdfReader.GetStreamBytes(stream)[posq] != 'q') posq --;

                                }
                                catch (Exception)
                                {
                                    throw;
                                }
                                
                            }
                            else if (cmd.ToString() == "Q") // GraphicsStateの終了
                            {
                                if (posq > 0 && isText && isMatch)
                                {
                                    // streamから消す区間
                                    try
                                    {
                                        posQ = tokeniser.FilePointer;

                                        // Pointerの位置がQの後の改行を指している場合
                                        //if (PdfReader.GetStreamBytes(stream)[posQ] != 'Q') posQ --; 
                                    }
                                    catch (Exception)
                                    {
                                        throw;
                                    }
                                    
                                    lArea.Add(new TextAreaInfo(posq, posQ));
                                }
                                posQ = -1;
                                posq = -1;
                                isText = false;
                                isMatch = false;
                            }
                            else if (cmd.ToString() == "BT") // テキストエリアの開始
                            {
                                isText = true;
                            }
                            else if (cmd.ToString() == "ET") { }// テキストエリアの終了
                            else if (cmd.ToString() == "Tf") // フォント
                            {
                                PdfName fontResourceName = (PdfName)operands[0];
                                float size = ((PdfNumber)operands[1]).FloatValue;

                                PdfDictionary resourcesDic = page.GetAsDict(PdfName.RESOURCES);
                                PdfDictionary fontsDictionary = resourcesDic.GetAsDict(PdfName.FONT);

                                PdfObject fontObject = fontsDictionary.Get(fontResourceName);
                                font = GetFont((PRIndirectReference)fontObject);
                            }
                            else if (cmd.ToString() == "Tj")  
                            {
                                PdfString str = (PdfString)operands[0];
                                byte[] bytes = str.GetBytes();
                                if (isText && font.Decode(bytes, 0, bytes.Length) == keyword) isMatch = true;

                                Console.WriteLine(font.Decode(bytes, 0, bytes.Length));
                            }
                            else if (cmd.ToString() == "TJ") 
                            {
                                PdfArray array = (PdfArray)operands[0];
                                StringBuilder sb = new StringBuilder();
                                foreach (PdfObject entryObj in array)
                                {
                                    if (entryObj is PdfString)
                                    {
                                        byte[] bytes = entryObj.GetBytes();
                                        sb.Append(font.Decode(bytes, 0, bytes.Length));
                                    }
                                }
                                if (isText && sb.ToString() == keyword) isMatch = true;
                                Console.WriteLine(sb.ToString());
                            }
                        }

                        lArea.Reverse(); // 出現箇所を後ろから処理
                        foreach (TextAreaInfo A in lArea)
                        {
                            byte[] shalf = PdfReader.GetStreamBytes(stream).Take((int)A.start).ToArray();
                            byte[] ehalf = PdfReader.GetStreamBytes(stream).Skip((int)A.end + 1).ToArray();
                            Array.Resize(ref shalf, shalf.Length + ehalf.Length);
                            Array.Copy(ehalf, 0, shalf, shalf.Length - ehalf.Length, ehalf.Length);

                            stream.Put(PdfName.LENGTH, new PdfNumber(shalf.Length));
                            stream.SetData(shalf);
(PdfReader.GetStreamBytes(stream));
                        }
                    }
                }
            }

        }
You get articles that match your needs
You can efficiently read back useful information
You can use dark theme
What you can do with signing up