/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.preparator;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;
import org.apache.log4j.Logger;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripper;

public class PdfBoxPreparator
extends AbstractPreparator {
    private static Logger mLog = Logger.getLogger(PdfBoxPreparator.class);

    public PdfBoxPreparator() throws RegainException {
        super("application/pdf");
    }

    public void prepare(RawDocument rawDocument) throws RegainException {
        String url = rawDocument.getUrl();
        InputStream stream = null;
        PDDocument pdfDocument = null;
        try {
            stream = rawDocument.getContentAsStream();
            PDFParser parser = new PDFParser(stream);
            parser.parse();
            pdfDocument = parser.getPDDocument();
            if (pdfDocument.isEncrypted()) {
                mLog.debug((Object)("Document is encrypted: " + url));
                StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
                pdfDocument.openProtection(sdm);
                AccessPermission ap = pdfDocument.getCurrentAccessPermission();
                if (!ap.canExtractContent()) {
                    throw new RegainException("Document is encrypted and can't be opened: " + url);
                }
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSuppressDuplicateOverlappingText(false);
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(Integer.MAX_VALUE);
            this.setCleanedContent(stripper.getText(pdfDocument).replaceAll("visiblespace", " "));
            StringBuilder annotsResult = new StringBuilder();
            List allPages = pdfDocument.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); ++i) {
                int pageNum = i + 1;
                PDPage page = (PDPage)allPages.get(i);
                List annotations = page.getAnnotations();
                if (annotations.size() < 1) continue;
                mLog.debug((Object)("Total annotations = " + annotations.size()));
                mLog.debug((Object)("\nProcess Page " + pageNum + "..."));
                for (PDAnnotation annotation : annotations) {
                    if (annotation.getContents() == null || annotation.getContents().length() <= 0) continue;
                    annotsResult.append(annotation.getContents());
                    annotsResult.append(" ");
                    mLog.debug((Object)("Text from annotation: " + annotation.getContents()));
                }
            }
            if (annotsResult.length() > 0) {
                this.setCleanedContent(this.getCleanedContent() + " Annotations " + annotsResult.toString());
            }
            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            StringBuilder metaData = new StringBuilder();
            metaData.append("p.");
            metaData.append(Integer.toString(pdfDocument.getNumberOfPages()));
            metaData.append(" ");
            if (info.getAuthor() != null) {
                metaData.append(info.getAuthor());
                metaData.append(" ");
            }
            if (info.getSubject() != null) {
                metaData.append(info.getSubject());
                metaData.append(" ");
            }
            if (info.getKeywords() != null) {
                metaData.append(info.getKeywords());
                metaData.append(" ");
            }
            if (info.getTitle() != null) {
                this.setTitle(info.getTitle());
            }
            this.setCleanedMetaData(metaData.toString());
            if (mLog.isDebugEnabled()) {
                mLog.debug((Object)("Extracted meta data ::" + this.getCleanedMetaData() + ":: from " + rawDocument.getUrl()));
            }
        }
        catch (CryptographyException exc) {
            throw new RegainException("Error decrypting document: " + url, (Throwable)exc);
        }
        catch (BadSecurityHandlerException exc) {
            throw new RegainException("Document is encrypted: " + url, (Throwable)exc);
        }
        catch (IOException exc) {
            throw new RegainException("Error reading document: " + url, (Throwable)exc);
        }
        finally {
            if (stream != null) {
                try {
                    stream.close();
                }
                catch (Exception exc) {}
            }
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                }
                catch (Exception exc) {}
            }
        }
    }
}

