/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.preparator;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import net.sf.regain.RegainException;
import net.sf.regain.crawler.CrawlerToolkit;
import net.sf.regain.crawler.config.PreparatorConfig;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.PathElement;
import net.sf.regain.crawler.document.RawDocument;
import net.sf.regain.crawler.preparator.html.HtmlContentExtractor;
import net.sf.regain.crawler.preparator.html.HtmlPathExtractor;
import net.sf.regain.crawler.preparator.html.LinkVisitor;
import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.StringBean;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;

public class HtmlPreparator
extends AbstractPreparator {
    private static Logger mLog = Logger.getLogger(HtmlPreparator.class);
    private List<HtmlContentExtractor> mContentExtractorList;
    private List<HtmlPathExtractor> mPathExtractorList;

    public HtmlPreparator() throws RegainException {
        super(new String[]{"text/html", "application/xhtml+xml"});
    }

    public void init(PreparatorConfig config) throws RegainException {
        String prefix;
        List sectionList = config.getSectionsWithNameList("contentExtractor");
        this.mContentExtractorList = new ArrayList<HtmlContentExtractor>(sectionList.size());
        for (Map section : sectionList) {
            prefix = (String)section.get("prefix");
            String contentStartRegex = (String)section.get("startRegex");
            String contentEndRegex = (String)section.get("endRegex");
            String headlineRegex = (String)section.get("headlineRegex");
            int headlineRegexGroup = this.getIntParam(section, "headlineRegex.group");
            this.mContentExtractorList.add(new HtmlContentExtractor(prefix, contentStartRegex, contentEndRegex, headlineRegex, headlineRegexGroup));
        }
        sectionList = config.getSectionsWithNameList("pathExtractor");
        this.mPathExtractorList = new ArrayList<HtmlPathExtractor>(sectionList.size());
        for (Map section : sectionList) {
            prefix = (String)section.get("prefix");
            String pathStartRegex = (String)section.get("startRegex");
            String pathEndRegex = (String)section.get("endRegex");
            String pathNodeRegex = (String)section.get("pathNodeRegex");
            int pathNodeUrlGroup = this.getIntParam(section, "pathNodeRegex.urlGroup");
            int pathNodeTitleGroup = this.getIntParam(section, "pathNodeRegex.titleGroup");
            this.mPathExtractorList.add(new HtmlPathExtractor(prefix, pathStartRegex, pathEndRegex, pathNodeRegex, pathNodeUrlGroup, pathNodeTitleGroup));
        }
    }

    private int getIntParam(Map<String, String> configSection, String paramName) throws RegainException {
        String asString = configSection.get(paramName);
        if (asString == null) {
            throw new RegainException("Error in configuration for " + ((Object)((Object)this)).getClass().getName() + ": Preparator param '" + paramName + "' is not set");
        }
        asString = asString.trim();
        try {
            return Integer.parseInt(asString);
        }
        catch (NumberFormatException exc) {
            throw new RegainException("Error in configuration for " + ((Object)((Object)this)).getClass().getName() + ": Preparator param '" + paramName + "' is not a number: '" + asString + "'", (Throwable)exc);
        }
    }

    public void prepare(RawDocument rawDocument) throws RegainException {
        String headlines;
        String cuttedContent;
        String title = this.extractHtmlTitle(rawDocument.getContentAsString());
        this.setTitle(title);
        HtmlContentExtractor contentExtractor = null;
        if (this.mContentExtractorList != null) {
            for (HtmlContentExtractor iContextExtractor : this.mContentExtractorList) {
                if (!iContextExtractor.accepts(rawDocument)) continue;
                contentExtractor = iContextExtractor;
            }
        }
        boolean isContentCutted = false;
        if (contentExtractor == null) {
            if (mLog.isDebugEnabled()) {
                mLog.debug((Object)("No HTML content extractor is responsible for " + rawDocument.getUrl()));
            }
            cuttedContent = rawDocument.getContentAsString();
            headlines = null;
        } else {
            cuttedContent = contentExtractor.extractContent(rawDocument);
            headlines = contentExtractor.extractHeadlines(cuttedContent);
            if (!cuttedContent.equals(rawDocument.getContentAsString())) {
                isContentCutted = true;
            }
        }
        String cleanedContent = null;
        Page htmlPage = new Page(cuttedContent, "UTF-8");
        Parser parser = new Parser(new Lexer(htmlPage));
        StringBean stringBean = new StringBean();
        stringBean.setCollapse(true);
        stringBean.setLinks(false);
        stringBean.setReplaceNonBreakingSpaces(true);
        try {
            parser.visitAllNodesWith(stringBean);
            cleanedContent = stringBean.getStrings();
        }
        catch (ParserException ex) {
            throw new RegainException("Error while parsing content: ", (Throwable)ex);
        }
        this.setCleanedContent(cleanedContent);
        LinkVisitor linkVisitor = new LinkVisitor();
        if (isContentCutted) {
            htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
            parser = new Parser(new Lexer(htmlPage));
        } else {
            parser.reset();
        }
        try {
            parser.visitAllNodesWith(linkVisitor);
            ArrayList<Tag> links = linkVisitor.getLinks();
            ArrayList<Tag> frames = linkVisitor.getFrames();
            if (linkVisitor.getBaseTag() != null) {
                htmlPage.setBaseUrl(((BaseHrefTag)linkVisitor.getBaseTag()).getBaseUrl());
            } else {
                htmlPage.setBaseUrl(rawDocument.getUrl());
            }
            mLog.debug((Object)("Set base URL to: " + htmlPage.getBaseUrl()));
            for (LinkTag linkTag : links) {
                String linkText;
                String string2 = CrawlerToolkit.removeAnchor((String)linkTag.extractLink());
                string2 = CrawlerToolkit.completeDirectory((String)string2);
                String string3 = linkText = linkTag.getLinkText() == null ? "" : linkTag.getLinkText();
                if (!linkTag.isHTTPLikeLink()) continue;
                rawDocument.addLink(string2, linkText);
            }
            for (FrameTag frameTag : frames) {
                String link = CrawlerToolkit.removeAnchor((String)frameTag.getFrameLocation());
                link = CrawlerToolkit.completeDirectory((String)link);
                rawDocument.addLink(link, "frame");
            }
        }
        catch (ParserException ex) {
            throw new RegainException("Error while extracting links: ", (Throwable)ex);
        }
        if (headlines != null) {
            headlines = CrawlerToolkit.replaceHtmlEntities((String)headlines);
            this.setHeadlines(headlines);
        }
        HtmlPathExtractor pathExtractor = null;
        if (this.mPathExtractorList != null) {
            for (HtmlPathExtractor iPathExtractor : this.mPathExtractorList) {
                if (!iPathExtractor.accepts(rawDocument)) continue;
                pathExtractor = iPathExtractor;
            }
        }
        if (pathExtractor != null) {
            PathElement[] path = pathExtractor.extractPath(rawDocument);
            this.setPath(path);
        }
    }

    private String extractHtmlTitle(String content) {
        String TITLE_START_TAG = "<title>";
        int pos = -1;
        int startPos = -1;
        while ((pos = content.indexOf(60, pos + 1)) != -1) {
            if (this.isIndexOf(content, "<title>", pos)) {
                startPos = pos + "<title>".length();
                break;
            }
            if (!this.isIndexOf(content, "<body", pos)) continue;
        }
        if (startPos != -1) {
            pos = startPos - 1;
            while ((pos = content.indexOf(60, pos + 1)) != -1) {
                if (this.isIndexOf(content, "</title>", pos)) {
                    return content.substring(startPos, pos);
                }
                if (pos <= startPos + 1000) continue;
                break;
            }
        }
        return null;
    }

    private boolean isIndexOf(String content, String expected, int pos) {
        if (content.length() < pos + expected.length()) {
            return false;
        }
        String substring = content.substring(pos, pos + expected.length());
        return expected.equalsIgnoreCase(substring);
    }
}

