/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.document;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Properties;
import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.ErrorLogger;
import net.sf.regain.crawler.Profiler;
import net.sf.regain.crawler.access.CrawlerAccessController;
import net.sf.regain.crawler.config.AuxiliaryField;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.PreparatorSettings;
import net.sf.regain.crawler.document.PathElement;
import net.sf.regain.crawler.document.Preparator;
import net.sf.regain.crawler.document.PreparatorFactory;
import net.sf.regain.crawler.document.PreparatorProfilerPair;
import net.sf.regain.crawler.document.RawDocument;
import net.sf.regain.crawler.document.WriteablePreparator;
import net.sf.regain.crawler.plugin.CrawlerPluginManager;
import net.sf.regain.util.io.PathFilenamePair;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifierFactory;

public class DocumentFactory {
    /** Fallback mime type used when detection yields nothing or fails. */
    private static final String MIME_TYPE_UNKNOWN = "application/x-unknown-mime-type";
    /** The logger of this class. */
    private static Logger mLog = Logger.getLogger(DocumentFactory.class);
    /** The crawler configuration handed to the constructor. */
    private CrawlerConfig mConfig;
    /** Maximum length of a summary that is generated from the content; read from the configuration. */
    private int mMaxSummaryLength;
    /** Whether the "content" field is stored in the index (and not only indexed); read from the configuration. */
    private boolean storeContentForPreview;
    /** Directory the analysis files are written to; {@code null} disables writing analysis files. */
    private File mAnalysisDir = null;
    /** The preparators created from the configured preparator settings. */
    private Preparator[] mPreparatorArr;
    /** One profiler per preparator; index i belongs to {@code mPreparatorArr[i]}. */
    private Profiler[] mPreparatorProfilerArr;
    /** The optional access controller; {@code null} when no controller class is configured. */
    private CrawlerAccessController mCrawlerAccessController;
    /** URL patterns for which the text of the source link replaces the document title. */
    private RE[] mUseLinkTextAsTitleReArr;
    /** Profiler measuring the writing of analysis files. */
    private Profiler mWriteAnalysisProfiler = new Profiler("Writing Analysis files", "files");
    /** Identifier used to detect the mime type of a document; created in the constructor. */
    MimeTypeIdentifier mimeTypeIdentifier;
    /** Plugin manager that is notified before and after a document is prepared. */
    private CrawlerPluginManager pluginManager = CrawlerPluginManager.getInstance();

    /**
     * Creates a new document factory.
     * <p>
     * Sets up, in this order: the preparators and one profiler per preparator,
     * the optional crawler access controller, the compiled
     * use-link-text-as-title patterns, the summary/preview settings and the
     * mime type identifier.
     *
     * @param config The crawler configuration to read all settings from.
     * @param analysisDir The directory analysis files are written to, or
     *        {@code null} if no analysis files should be written.
     * @throws RegainException If creating the preparators fails or if one of
     *         the use-link-text-as-title patterns has a syntax error.
     */
    public DocumentFactory(CrawlerConfig config, File analysisDir) throws RegainException {
        this.mConfig = config;
        this.mAnalysisDir = analysisDir;

        // Preparators
        try {
            PreparatorSettings[] preparatorSettings = config.getPreparatorSettingsList();
            this.mPreparatorArr = PreparatorFactory.getInstance().createPreparatorArr(preparatorSettings);
        }
        catch (RegainException exc) {
            throw new RegainException("Creating the document preparators failed", exc);
        }

        // One profiler per preparator, stored at the same index
        int preparatorCount = this.mPreparatorArr.length;
        this.mPreparatorProfilerArr = new Profiler[preparatorCount];
        for (int idx = 0; idx < preparatorCount; idx++) {
            String preparatorClassName = this.mPreparatorArr[idx].getClass().getName();
            this.mPreparatorProfilerArr[idx] = new Profiler("Preparator " + preparatorClassName, "docs");
        }

        // Optional crawler access controller
        String controllerClassName = config.getCrawlerAccessControllerClass();
        if (controllerClassName != null) {
            String controllerJar = config.getCrawlerAccessControllerJar();
            this.mCrawlerAccessController = (CrawlerAccessController)RegainToolkit.createClassInstance(controllerClassName, CrawlerAccessController.class, controllerJar);
            Properties controllerProps = config.getCrawlerAccessControllerConfig();
            // The controller is always initialised with a non-null Properties object
            this.mCrawlerAccessController.init(controllerProps != null ? controllerProps : new Properties());
            mLog.info("Using crawler access controller: " + controllerClassName);
        }

        // Patterns of URLs whose link text should be used as title
        String[] patterns = config.getUseLinkTextAsTitleRegexList();
        int patternCount = (patterns == null) ? 0 : patterns.length;
        this.mUseLinkTextAsTitleReArr = new RE[patternCount];
        for (int idx = 0; idx < patternCount; idx++) {
            try {
                this.mUseLinkTextAsTitleReArr[idx] = new RE(patterns[idx]);
            }
            catch (RESyntaxException exc) {
                throw new RegainException("Regular expression of use-link-text-as-title-pattern #" + idx + " has wrong syntax '" + patterns[idx] + "'", exc);
            }
        }

        this.mMaxSummaryLength = this.mConfig.getMaxSummaryLength();
        this.storeContentForPreview = this.mConfig.getStoreContentForPreview();

        // Mime type detection
        this.mimeTypeIdentifier = new MagicMimeTypeIdentifierFactory().get();
    }

    /**
     * Creates a lucene document from a raw document.
     * <p>
     * First the mime type is detected from the leading bytes of the content
     * file and stored in the raw document. Then every preparator that accepts
     * the raw document is tried, in the order the priority queue yields them,
     * until one of them produces a document. If preparators matched but all
     * of them failed, a substitute document without content is created.
     *
     * @param rawDocument The raw document to create the lucene document for.
     * @param errorLogger The logger that receives errors of the mime type
     *        detection and of the preparation.
     * @return The created document, or {@code null} if the content file is
     *         not readable, if no preparator accepts the document, or if
     *         even the substitute document could not be created.
     */
    public Document createDocument(RawDocument rawDocument, ErrorLogger errorLogger) {
        String mimeType;
        FileInputStream fis = null;
        try {
            File file = rawDocument.getContentAsFile();
            if (!file.canRead()) {
                mLog.warn("canRead() on file return: false. Maybe no access rights for sourceURL: " + RegainToolkit.fileToUrl(file));
                return null;
            }
            fis = new FileInputStream(file);
            byte[] bytes = new byte[this.mimeTypeIdentifier.getMinArrayLength()];
            // A single read() may deliver fewer bytes than requested, so keep
            // reading until the buffer is full or the file ends. Files shorter
            // than the buffer leave the remainder zero-filled.
            int filled = 0;
            while (filled < bytes.length) {
                int count = fis.read(bytes, filled, bytes.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
            mimeType = this.mimeTypeIdentifier.identify(bytes, file.getPath(), new URIImpl(rawDocument.getUrl(), false));
            if (mimeType == null || mimeType.length() == 0) {
                mimeType = MIME_TYPE_UNKNOWN;
            }
            mLog.debug("Detected mimetype cycle 1: " + mimeType + ". " + rawDocument.getUrl());
            if (mimeType.equalsIgnoreCase("application/zip")) {
                // Second pass with a rewritten URL to refine the type of zip containers
                String zipUrl = "zip:mime:file:" + rawDocument.getUrl();
                String refinedMimeType = this.mimeTypeIdentifier.identify(bytes, null, new URIImpl(zipUrl));
                mLog.debug("Detected mimetype cycle 2: " + refinedMimeType + ". " + zipUrl);
                // Keep the result of the first pass when the second one finds nothing,
                // otherwise the raw document would end up with a null mime type
                if (refinedMimeType != null && refinedMimeType.length() != 0) {
                    mimeType = refinedMimeType;
                }
            }
        }
        catch (Exception exc) {
            errorLogger.logError("Determine mime-type of " + rawDocument.getUrl() + " failed", exc, false);
            mimeType = MIME_TYPE_UNKNOWN;
        }
        finally {
            if (fis != null) {
                try {
                    fis.close();
                }
                catch (IOException exc) {
                    // The stream was only read from, so a failing close is harmless
                    mLog.debug("Closing content file of " + rawDocument.getUrl() + " failed", exc);
                }
            }
        }
        rawDocument.setMimeType(mimeType);

        // Collect all preparators that accept the document.
        // PriorityQueue rejects an initial capacity below 1, which would
        // otherwise throw when no preparator is configured.
        int queueCapacity = Math.max(1, this.mPreparatorArr.length);
        PriorityQueue<PreparatorProfilerPair> matchingPreparators = new PriorityQueue<PreparatorProfilerPair>(queueCapacity);
        for (int i = 0; i < this.mPreparatorArr.length; ++i) {
            Preparator preparator = this.mPreparatorArr[i];
            if (!preparator.accepts(rawDocument)) {
                continue;
            }
            matchingPreparators.add(new PreparatorProfilerPair(preparator, this.mPreparatorProfilerArr[i]));
            if (mLog.isDebugEnabled()) {
                mLog.debug("Found: " + preparator.getClass().getSimpleName() + ", Prio: " + preparator.getPriority());
            }
        }

        // Try the matching preparators one after another until one succeeds
        Document doc = null;
        boolean preparatorFound = false;
        PreparatorProfilerPair candidate;
        while (doc == null && (candidate = matchingPreparators.poll()) != null) {
            preparatorFound = true;
            try {
                doc = this.createDocument(candidate.getPreparator(), candidate.getProfiler(), rawDocument);
                mLog.info("Preparation with " + candidate.getPreparator().getClass().getSimpleName() + " done: " + rawDocument.getUrl());
            }
            catch (RegainException exc) {
                errorLogger.logError("Preparing " + rawDocument.getUrl() + " with preparator " + candidate.getPreparator().getClass().getName() + " failed", exc, false);
            }
        }

        if (!preparatorFound) {
            mLog.info("No preparator feels responsible for " + rawDocument.getUrl());
        } else if (doc == null) {
            // Every matching preparator failed: index at least the meta data
            try {
                doc = this.createSubstituteDocument(rawDocument);
                mLog.info("Created substitute document: " + rawDocument.getUrl());
            }
            catch (RegainException exc) {
                errorLogger.logError("Creating substitute document for " + rawDocument.getUrl() + " failed", exc, false);
            }
        }
        return doc;
    }

    /**
     * Creates a lucene document by preparing a raw document with one
     * specific preparator.
     * <p>
     * The plugin manager is notified before and after the preparation. The
     * time the preparation takes is measured with the given profiler.
     *
     * @param preparator The preparator to use.
     * @param preparatorProfiler The profiler that measures the preparator.
     * @param rawDocument The raw document to prepare.
     * @return The created lucene document.
     * @throws RegainException If the preparation fails or if the preparator
     *         did not extract any content.
     */
    private Document createDocument(Preparator preparator, Profiler preparatorProfiler, RawDocument rawDocument) throws RegainException {
        Map<String, String> additionalFieldMap;
        PathElement[] path;
        String headlines;
        String metadata;
        String summary;
        String title;
        String cleanedContent;
        String url = rawDocument.getUrl();
        if (mLog.isDebugEnabled()) {
            mLog.debug("Using preparator " + preparator.getClass().getName() + " for " + rawDocument + ", " + rawDocument.getMimeType());
        }
        preparatorProfiler.startMeasuring();
        this.pluginManager.eventBeforePrepare(rawDocument, (WriteablePreparator)preparator);
        // Tracks whether cleanUp() was already attempted, so it is never called twice
        boolean cleanUpAttempted = false;
        try {
            preparator.prepare(rawDocument);
            this.pluginManager.eventAfterPrepare(rawDocument, (WriteablePreparator)preparator);
            // Fetch all results before cleanUp() is called on the preparator
            cleanedContent = preparator.getCleanedContent();
            title = preparator.getTitle();
            summary = preparator.getSummary();
            metadata = preparator.getCleanedMetaData();
            headlines = preparator.getHeadlines();
            path = preparator.getPath();
            additionalFieldMap = preparator.getAdditionalFields();
            cleanUpAttempted = true;
            preparator.cleanUp();
            preparatorProfiler.stopMeasuring(rawDocument.getLength());
        }
        catch (Throwable thr) {
            preparatorProfiler.abortMeasuring();
            if (!cleanUpAttempted) {
                // The preparator instance is reused for the next document, so
                // clean it up after a failure as well. This is best effort:
                // a failing cleanUp must not hide the original error.
                try {
                    preparator.cleanUp();
                }
                catch (Throwable cleanUpThr) {
                    mLog.warn("Cleaning up preparator " + preparator.getClass().getName() + " after a failed preparation failed", cleanUpThr);
                }
            }
            throw new RegainException("Preparing " + url + " with preparator " + preparator.getClass().getName() + " failed", thr);
        }
        if (cleanedContent == null) {
            throw new RegainException("Preparator " + preparator.getClass().getName() + " did not extract the content of " + url);
        }
        return this.createDocument(rawDocument, cleanedContent, title, summary, metadata, headlines, path, additionalFieldMap);
    }

    /**
     * Creates a substitute lucene document for a raw document that could not
     * be prepared. No content, title, summary, meta data, headlines, path or
     * additional fields are passed on, so the document only carries what is
     * derived from the raw document itself.
     *
     * @param rawDocument The raw document to create the substitute for.
     * @return The substitute document.
     * @throws RegainException If creating the document fails.
     */
    private Document createSubstituteDocument(RawDocument rawDocument) throws RegainException {
        Document substitute = this.createDocument(rawDocument, null, null, null, null, null, null, null);
        return substitute;
    }

    /**
     * Creates a lucene document from a raw document and the data a
     * preparator extracted from it.
     * <p>
     * Fields are added in this order: auxiliary fields, access groups, url,
     * filename, size, mime type, last-modified, additional fields, content
     * (or a "preparation-error" marker when there is no content), title,
     * summary, meta data, headlines and path. Analysis files are written
     * along the way when an analysis directory is configured.
     *
     * @param rawDocument The raw document the data belongs to.
     * @param cleanedContent The extracted content; may be {@code null} or empty.
     * @param title The title; may be {@code null}. It is replaced by the
     *        source link text if the URL matches a use-link-text-as-title pattern.
     * @param summary The summary; if missing it is generated from the content.
     * @param metadata The cleaned meta data; may be {@code null}.
     * @param headlines The headlines; may be {@code null}.
     * @param path The path of the document; currently not evaluated by this method.
     * @param additionalFieldMap Additional fields to index; may be {@code null}.
     * @return The created lucene document.
     * @throws RegainException If creating the document fails.
     */
    private Document createDocument(RawDocument rawDocument, String cleanedContent, String title, String summary, String metadata, String headlines, PathElement[] path, Map<String, String> additionalFieldMap) throws RegainException {
        // Normalise the URL before its first use: the auxiliary field handling
        // below calls methods on it and matches regular expressions against it.
        String url = rawDocument.getUrl();
        if (url == null) {
            url = "";
        }
        Document doc = new Document();

        this.addAuxiliaryFields(doc, url);
        this.addAccessGroupsField(doc, rawDocument);

        // URL and file name
        doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
        String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
        doc.add(new Field("filename", filenameWithVariants, Field.Store.YES, Field.Index.ANALYZED));
        PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);
        doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Size, mime type and modification date
        int size = rawDocument.getLength();
        doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));
        String mimeType = rawDocument.getMimeType();
        if (mimeType == null) {
            mimeType = MIME_TYPE_UNKNOWN;
        }
        doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
        Date lastModified = rawDocument.getLastModified();
        if (lastModified == null) {
            // No modification date known: fall back to the current date
            lastModified = new Date();
        }
        doc.add(new Field("last-modified", DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED));

        this.writeContentAnalysisFile(rawDocument);

        // Additional fields: indexed as text and stored compressed
        if (additionalFieldMap != null) {
            for (Map.Entry<String, String> entry : additionalFieldMap.entrySet()) {
                String fieldName = entry.getKey();
                String fieldValue = entry.getValue();
                if (fieldName == null || fieldValue == null) {
                    // A field needs both a name and a value; skip incomplete
                    // entries instead of failing the whole document
                    if (mLog.isDebugEnabled()) {
                        mLog.debug("Skipping additional field without name or value: " + fieldName);
                    }
                    continue;
                }
                doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue)));
            }
        }

        // Content, or a marker that the preparation yielded nothing
        if (this.hasContent(cleanedContent)) {
            this.writeAnalysisFile(url, "clean", cleanedContent);
            doc.add(new Field("content", cleanedContent, this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
        } else {
            doc.add(new Field("preparation-error", "true", Field.Store.YES, Field.Index.NO));
        }

        // Title: the first matching pattern decides; the link text is only
        // used when the raw document actually has one
        for (int i = 0; i < this.mUseLinkTextAsTitleReArr.length; ++i) {
            if (!this.mUseLinkTextAsTitleReArr[i].match(url)) {
                continue;
            }
            String linkText = rawDocument.getSourceLinkText();
            if (linkText != null) {
                title = linkText;
            }
            break;
        }
        if (this.hasContent(title)) {
            doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        } else {
            doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
        }

        // Summary: generated from the content when the preparator provided none
        if (!this.hasContent(summary) && this.hasContent(cleanedContent)) {
            summary = this.createSummaryFromContent(cleanedContent);
        }
        if (this.hasContent(summary)) {
            doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field("summary", CompressionTools.compressString(summary)));
        }

        if (this.hasContent(metadata)) {
            doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
        }
        if (this.hasContent(headlines)) {
            doc.add(new Field("headlines", headlines, Field.Store.NO, Field.Index.ANALYZED));
        }

        // Path as derived from the URL
        String urlPath = pfPair.getPath();
        if (urlPath != null) {
            doc.add(new Field("path", urlPath, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("path_sort", urlPath.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            this.writeAnalysisFile(url, "path", urlPath);
        } else {
            doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
        }
        return doc;
    }

    /**
     * Adds the configured auxiliary fields whose regular expression matches
     * the URL (or the file system path, depending on the field's source).
     *
     * @param doc The document to add the fields to.
     * @param url The URL of the document; must not be {@code null}.
     * @throws RegainException If converting a file URL to a path fails.
     */
    private void addAuxiliaryFields(Document doc, String url) throws RegainException {
        AuxiliaryField[] auxiliaryFieldArr = this.mConfig.getAuxiliaryFieldList();
        if (auxiliaryFieldArr == null) {
            return;
        }
        // Computed lazily, only when the first PATH based field is encountered
        String docPath = null;
        for (int i = 0; i < auxiliaryFieldArr.length; ++i) {
            AuxiliaryField auxiliaryField = auxiliaryFieldArr[i];
            RE regex = auxiliaryField.getRegex();
            String sourceValue = url;
            if (auxiliaryField.getSourceField() == AuxiliaryField.SourceField.PATH) {
                if (docPath == null) {
                    if (url.startsWith("file:")) {
                        File file = RegainToolkit.urlToFile(url);
                        docPath = file.getAbsolutePath();
                    } else {
                        docPath = url;
                    }
                }
                sourceValue = docPath;
            }
            if (!regex.match(sourceValue)) {
                continue;
            }
            String targetFieldName = auxiliaryField.getTargetFieldName();
            // A fixed value wins; otherwise the configured regex group is used
            String value = auxiliaryField.getValue();
            if (value == null) {
                value = regex.getParen(auxiliaryField.getRegexGroup());
            }
            if (value == null) {
                continue;
            }
            if (auxiliaryField.getToLowerCase()) {
                value = value.toLowerCase();
            }
            if (mLog.isDebugEnabled()) {
                mLog.debug("Adding auxiliary field: " + targetFieldName + "=" + value);
            }
            boolean store = auxiliaryField.isStored();
            boolean index = auxiliaryField.isIndexed();
            boolean token = auxiliaryField.isTokenized();
            doc.add(new Field(targetFieldName, value, store ? Field.Store.YES : Field.Store.NO, index ? (token ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO));
        }
    }

    /**
     * Adds the "groups" field holding the groups the access controller
     * reports for the document. Does nothing when no access controller is
     * configured.
     *
     * @param doc The document to add the field to.
     * @param rawDocument The raw document to get the groups for.
     * @throws RegainException If getting or checking the groups fails.
     */
    private void addAccessGroupsField(Document doc, RawDocument rawDocument) throws RegainException {
        if (this.mCrawlerAccessController == null) {
            return;
        }
        String[] groupArr = this.mCrawlerAccessController.getDocumentGroups(rawDocument);
        RegainToolkit.checkGroupArray(this.mCrawlerAccessController, groupArr);
        // The groups are joined with blanks and split again by the whitespace tokenizer
        StringBuilder tokenBuilder = new StringBuilder();
        for (String group : groupArr) {
            tokenBuilder.append(group).append(" ");
        }
        doc.add(new Field("groups", new WhitespaceTokenizer(RegainToolkit.getLuceneVersion(), (Reader)new StringReader(tokenBuilder.toString()))));
    }

    /**
     * Returns whether a String has content.
     *
     * @param str The String to check; may be {@code null}.
     * @return {@code true} if the String is neither {@code null} nor empty.
     */
    private boolean hasContent(String str) {
        if (str == null) {
            return false;
        }
        return str.length() > 0;
    }

    /**
     * Creates a summary from the content of a document.
     * <p>
     * Content that does not exceed the maximum summary length is used as is.
     * Longer content is cut at the last blank found at or before the maximum
     * length and "..." is appended.
     *
     * @param content The content to create the summary from.
     * @return The summary, or {@code null} if the content is too long and
     *         contains no blank at or before the maximum length.
     */
    private String createSummaryFromContent(String content) {
        if (content.length() <= this.mMaxSummaryLength) {
            // Short enough: no cutting needed
            return content;
        }

        int cutPos = content.lastIndexOf(' ', this.mMaxSummaryLength);
        if (cutPos == -1) {
            return null;
        }
        return content.substring(0, cutPos) + "...";
    }

    /**
     * Creates a String representation of a path. Every path element becomes
     * one line consisting of its URL, a blank and its title.
     *
     * @param path The path to convert.
     * @return The String representation of the path.
     */
    private String pathToString(PathElement[] path) {
        StringBuilder lines = new StringBuilder();
        for (PathElement element : path) {
            lines.append(element.getUrl()).append(' ').append(element.getTitle()).append('\n');
        }
        return lines.toString();
    }

    /**
     * Writes the raw content of a document to an analysis file (one without
     * an extension). Does nothing if no analysis directory is set. A failure
     * is logged and not propagated.
     *
     * @param rawDocument The document whose content should be written.
     */
    private void writeContentAnalysisFile(RawDocument rawDocument) {
        if (this.mAnalysisDir != null) {
            File target = this.getAnalysisFile(rawDocument.getUrl(), null);

            this.mWriteAnalysisProfiler.startMeasuring();
            try {
                rawDocument.writeToFile(target);
                this.mWriteAnalysisProfiler.stopMeasuring(rawDocument.getLength());
            }
            catch (RegainException exc) {
                this.mWriteAnalysisProfiler.abortMeasuring();
                mLog.error("Writing analysis file failed", exc);
            }
        }
    }

    /**
     * Writes a text to an analysis file. Does nothing if no analysis
     * directory is set.
     * <p>
     * The text is written UTF-8 encoded, so the file content does not depend
     * on the platform's default charset. An I/O failure is logged and not
     * propagated.
     *
     * @param url The URL of the document the analysis file belongs to.
     * @param extension The extension of the analysis file, or {@code null}
     *        for no extension.
     * @param content The text to write.
     * @throws NullPointerException If an analysis directory is set and
     *         {@code content} is {@code null}.
     */
    public void writeAnalysisFile(String url, String extension, String content) {
        if (this.mAnalysisDir == null) {
            return;
        }
        File file = this.getAnalysisFile(url, extension);
        if (content == null) {
            throw new NullPointerException("Content for analysis file is null: " + file.getAbsolutePath());
        }
        this.mWriteAnalysisProfiler.startMeasuring();
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            stream = new FileOutputStream(file);
            writer = new OutputStreamWriter(stream, "UTF-8");
            writer.write(content);
            // The writer buffers the encoded bytes. Flush here, so that a
            // failing write is reported by the catch block below instead of
            // getting lost in the best-effort close of the finally block.
            writer.flush();
            this.mWriteAnalysisProfiler.stopMeasuring(content.length());
        }
        catch (IOException exc) {
            this.mWriteAnalysisProfiler.abortMeasuring();
            mLog.error("Writing analysis file failed", exc);
        }
        finally {
            // All data was flushed above (or the failure was already logged),
            // so failures while closing are only worth a debug message
            if (writer != null) {
                try {
                    writer.close();
                }
                catch (IOException exc) {
                    mLog.debug("Closing analysis file writer failed: " + file.getAbsolutePath(), exc);
                }
            }
            if (stream != null) {
                try {
                    stream.close();
                }
                catch (IOException exc) {
                    mLog.debug("Closing analysis file stream failed: " + file.getAbsolutePath(), exc);
                }
            }
        }
    }

    /**
     * Gets the analysis file for a URL.
     * <p>
     * A leading "http://" or "file://" is removed from the URL, then every
     * ':' and '/' is replaced by '_'. The result, optionally followed by a
     * dot and the extension, is the file name within the analysis directory.
     *
     * @param url The URL to get the analysis file for.
     * @param extension The extension of the file, or {@code null} for none.
     * @return The analysis file.
     */
    private File getAnalysisFile(String url, String extension) {
        String name = url;
        boolean hasKnownScheme = name.startsWith("http://") || name.startsWith("file://");
        if (hasKnownScheme) {
            // Both prefixes are 7 characters long
            name = name.substring(7);
        }

        // NOTE(review): other characters (e.g. '\\' or '?') are kept as they
        // are; confirm that this is fine for the target file systems
        name = RegainToolkit.replace(name, ":", "_");
        name = RegainToolkit.replace(name, "/", "_");

        String fileName = (extension == null) ? name : name + "." + extension;
        return new File(this.mAnalysisDir, fileName);
    }

    /**
     * Closes all preparators and, if it implements {@link Closeable}, the
     * crawler access controller.
     * <p>
     * A failure while closing one preparator is logged and does not prevent
     * the remaining ones from being closed. Calling this method more than
     * once is safe: the preparators are only closed by the first call.
     */
    public void close() {
        // Detach the array first, so a repeated call finds nothing to close
        // instead of failing on the already cleared field
        Preparator[] preparators = this.mPreparatorArr;
        this.mPreparatorArr = null;
        if (preparators != null) {
            for (int i = 0; i < preparators.length; ++i) {
                String preparatorClassName = preparators[i].getClass().getName();
                mLog.info("Closing preparator " + preparatorClassName);
                try {
                    preparators[i].close();
                }
                catch (Throwable thr) {
                    mLog.error("Closing preparator failed: " + preparatorClassName, thr);
                }
            }
        }
        // instanceof is false for null, so no separate null check is needed
        if (this.mCrawlerAccessController instanceof Closeable) {
            Closeable c = (Closeable)((Object)this.mCrawlerAccessController);
            try {
                c.close();
            }
            catch (IOException e) {
                mLog.error("Closing CrawlerAccessController failed:", e);
            }
        }
    }
}

