/*
 * Decompiled with CFR 0.152.
 */
package org.semanticdesktop.aperture.extractor.plaintext;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import org.mozilla.universalchardet.UniversalDetector;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.semanticdesktop.aperture.extractor.Extractor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.util.UtfUtil;
import org.semanticdesktop.aperture.vocabulary.NFO;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extractor that reads a stream as text and, when its first characters look
 * like plain text, stores the full text in the supplied {@link RDFContainer}.
 *
 * <p>When the caller does not supply a charset, one is guessed from the start
 * of the stream: first by looking for a byte order mark, then by running the
 * juniversalchardet detector over at most {@link #BYTES_TEST_LENGTH} bytes.
 */
public class PlainTextExtractor
implements Extractor {
    /** Maximum number of leading bytes inspected when guessing a charset; also used as the mark read limit. */
    public static final int BYTES_TEST_LENGTH = 16384;
    /** Number of leading characters checked to decide whether the content is plain text. */
    private static final int STRING_TEST_LENGTH = 256;
    /** Size of the chunk buffer used to feed the charset detector. */
    private static final int DETECTION_BUFFER_SIZE = 4096;
    /** Number of leading bytes read when looking for a byte order mark. */
    private static final int BOM_PROBE_LENGTH = 4;
    private static final Logger logger = LoggerFactory.getLogger(PlainTextExtractor.class);
    // Not read or written anywhere in this class; retained so the declared fields stay unchanged.
    private String notifiedCharsetName;

    /**
     * Tries to determine the charset of the data in the given stream.
     *
     * <p>The stream is marked at its current position (replacing any mark the
     * caller set earlier). If a byte order mark with a supported charset is
     * found, the stream is left positioned just after the BOM. Otherwise the
     * content-based detector is consulted and the stream is reset to the
     * marked position afterwards.
     *
     * @param iStream the stream to inspect; must support {@code mark()}
     * @return the guessed charset, or {@code null} if none could be determined
     * @throws IOException if the stream does not support {@code mark()}, or if
     *         reading from or resetting the stream fails
     */
    public static Charset guessCharset(InputStream iStream) throws IOException {
        if (!iStream.markSupported()) {
            throw new IOException("The stream must support mark()");
        }
        iStream.mark(BYTES_TEST_LENGTH);
        Charset charset = findCharsetByBom(iStream);
        if (charset == null) {
            charset = findCharsetByJUniversalChardet(iStream);
            // Rewind so the bytes consumed by the detector are available to the caller again.
            iStream.reset();
        }
        return charset;
    }

    /**
     * Extracts the textual content of the stream into {@code result}.
     *
     * <p>If {@code charset} is {@code null} it is guessed via
     * {@link #guessCharset(InputStream)} (wrapping the stream in a
     * {@link BufferedInputStream} when it does not support marking). If
     * guessing fails as well, the platform default charset is used. The first
     * {@link #STRING_TEST_LENGTH} characters are then checked; if any of them
     * is undefined or a non-whitespace control character, a warning is logged
     * and nothing is added to {@code result}. Otherwise, non-empty text is
     * added as {@code NIE.plainTextContent} together with the
     * {@code NFO.PlainTextDocument} type.
     *
     * <p>The stream is not closed by this method.
     *
     * @param id identifier of the document (not used by this implementation)
     * @param iStream the stream holding the document content
     * @param charset the charset to decode with, or {@code null} to guess it
     * @param mimeType MIME type of the document (not used by this implementation)
     * @param result container receiving the extracted statements
     * @throws ExtractorException wrapping any {@link IOException} raised while reading
     */
    public void extract(URI id, InputStream iStream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
        try {
            if (charset == null) {
                if (!iStream.markSupported()) {
                    iStream = new BufferedInputStream(iStream, BYTES_TEST_LENGTH);
                }
                charset = guessCharset(iStream);
            }
            InputStreamReader convertingReader;
            if (charset == null) {
                // No charset known or detected: fall back to the platform default.
                convertingReader = new InputStreamReader(iStream);
            } else {
                convertingReader = new InputStreamReader(iStream, charset);
            }
            // Pushback capacity must cover the whole probe so it can be unread below.
            PushbackReader reader = new PushbackReader(convertingReader, STRING_TEST_LENGTH);
            String firstChars = IOUtil.readString(reader, STRING_TEST_LENGTH);
            if (!looksLikePlainText(firstChars)) {
                logger.warn("Document does not contain plain text");
                return;
            }
            // Put the probed characters back so the full text is read in one go.
            reader.unread(firstChars.toCharArray());
            String text = IOUtil.readString(reader);
            if (text.length() > 0) {
                result.add(RDF.type, NFO.PlainTextDocument);
                result.add(NIE.plainTextContent, text);
            }
        } catch (IOException e) {
            throw new ExtractorException(e);
        }
    }

    /**
     * Checks that every character is defined and is either not an ISO control
     * character or is whitespace (so tabs and line breaks are accepted).
     *
     * @param sample the characters to check
     * @return {@code true} if all characters pass (including for an empty sample)
     */
    private static boolean looksLikePlainText(String sample) {
        int nrChars = sample.length();
        for (int i = 0; i < nrChars; ++i) {
            char c = sample.charAt(i);
            boolean acceptable = Character.isDefined(c)
                    && (!Character.isISOControl(c) || Character.isWhitespace(c));
            if (!acceptable) {
                return false;
            }
        }
        return true;
    }

    /**
     * Feeds up to {@link #BYTES_TEST_LENGTH} bytes of the stream to the
     * juniversalchardet detector and maps the detected name to a charset.
     * The stream is not reset here; the caller is responsible for that.
     *
     * @param iStream the stream to read from
     * @return the detected charset, or {@code null} if nothing was detected or
     *         the detected name is not usable on this JVM
     * @throws IOException if reading from the stream fails
     */
    private static Charset findCharsetByJUniversalChardet(InputStream iStream) throws IOException {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buffer = new byte[DETECTION_BUFFER_SIZE];
        detector.reset();
        int totalRead = 0;
        // Stop at the byte budget, at end of stream, or as soon as the detector has decided.
        while (totalRead < BYTES_TEST_LENGTH && !detector.isDone()) {
            int wanted = Math.min(buffer.length, BYTES_TEST_LENGTH - totalRead);
            int read = iStream.read(buffer, 0, wanted);
            if (read < 0) {
                break;
            }
            detector.handleData(buffer, 0, read);
            totalRead += read;
        }
        detector.dataEnd();
        String detectedName = detector.getDetectedCharset();
        return detectedName == null ? null : lookupCharset(detectedName);
    }

    /**
     * Looks for a byte order mark at the start of the stream. Relies on the
     * stream having been marked by the caller: the stream is reset after
     * probing, and the BOM bytes are skipped only when a supported charset
     * was resolved for them.
     *
     * @param iStream the stream to inspect, already marked at the probe position
     * @return the charset implied by the BOM, or {@code null} if there is no
     *         matching BOM or its charset is not usable on this JVM
     * @throws IOException if reading from or resetting the stream fails
     */
    private static Charset findCharsetByBom(InputStream iStream) throws IOException {
        byte[] firstBytes = IOUtil.readBytes(iStream, BOM_PROBE_LENGTH);
        byte[] bomBytes = UtfUtil.findMatchingBOM(firstBytes);
        iStream.reset();
        if (bomBytes == null) {
            return null;
        }
        String charsetName = UtfUtil.getCharsetName(bomBytes);
        if (charsetName == null) {
            return null;
        }
        Charset result = lookupCharset(charsetName);
        if (result != null) {
            // Consume the BOM so it does not end up in the decoded text.
            for (int i = 0; i < bomBytes.length; ++i) {
                iStream.read();
            }
        }
        return result;
    }

    /**
     * Resolves a charset name, treating an unknown or syntactically illegal
     * name as "no charset" instead of propagating the exception.
     *
     * @param charsetName the non-null name to resolve
     * @return the charset, or {@code null} if the name cannot be resolved
     */
    private static Charset lookupCharset(String charsetName) {
        try {
            return Charset.forName(charsetName);
        } catch (IllegalCharsetNameException e) {
            logger.info("Unsupported charset, trying to continue with current charset", e);
        } catch (UnsupportedCharsetException e) {
            logger.info("Unsupported charset, trying to continue with current charset", e);
        }
        return null;
    }
}

