/*
 * Decompiled with CFR 0.152.
 */
package com.openkm.extractor;

import com.openkm.core.Config;
import com.openkm.extractor.AbbyTextExtractor;
import com.openkm.extractor.CuneiformTextExtractor;
import com.openkm.extractor.RegisteredExtractors;
import com.openkm.extractor.Tesseract3TextExtractor;
import com.openkm.util.FileUtils;
import java.io.BufferedInputStream;
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.jackrabbit.extractor.AbstractTextExtractor;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Text extractor for PDF documents.
 *
 * <p>The text layer is read with PDFBox's {@link PDFTextStripper}. When the stripped text is
 * at most one character long, or when {@code Config.SYSTEM_PDF_FORCE_OCR} is set, the images
 * referenced by each page's resources are written to temporary files and handed to the first
 * registered OCR extractor instead.
 */
public class PdfTextExtractor extends AbstractTextExtractor {
    private static final Logger log = LoggerFactory.getLogger(PdfTextExtractor.class);

    /** Upper bound, in milliseconds, on the time spent running OCR over one document's images. */
    private static final long OCR_TIMEOUT_MILLIS = 600000L;

    /** Shortest prefix accepted by {@link File#createTempFile(String, String)}. */
    private static final int MIN_TEMP_PREFIX_LENGTH = 3;

    /**
     * Creates the extractor, passing the single content type {@code application/pdf} to the
     * superclass constructor.
     */
    public PdfTextExtractor() {
        super(new String[]{"application/pdf"});
    }

    /**
     * Extracts the text of a PDF document.
     *
     * <p>Any {@link Exception} raised while parsing, stripping text or running OCR is logged
     * as a warning and results in an empty reader. The PDF document obtained from the parser
     * is closed and {@code stream} is closed before this method returns, whatever the outcome.
     *
     * @param stream   the PDF content; always closed by this method
     * @param type     not used by this implementation
     * @param encoding not used by this implementation
     * @return a reader over the extracted (or OCR'd) text, or an empty reader on failure
     * @throws IOException if closing {@code stream} fails
     */
    public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser((InputStream)new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, (Writer)writer);
                String stripped = writer.toString().trim();
                log.debug("TextStripped: '{}'", (Object)stripped);
                if (!Config.SYSTEM_PDF_FORCE_OCR && stripped.length() > 1) {
                    // A usable text layer exists: return it untrimmed, exactly as stripped.
                    return new CharArrayReader(writer.toCharArray());
                }
                log.warn("PDF does not contains text layer");
                return new StringReader(this.ocrEmbeddedImages(document));
            }
            finally {
                this.closeParsedDocument(parser);
            }
        }
        catch (Exception e) {
            log.warn("Failed to extract PDF text content", (Throwable)e);
            return new StringReader("");
        }
        finally {
            stream.close();
        }
    }

    /**
     * Runs OCR over every image listed in the resources of every page and joins the results,
     * each followed by a single space.
     *
     * @param document the parsed PDF document
     * @return the concatenated OCR output; empty when no page exposes any image
     * @throws Exception if the time budget is exceeded ({@link IOException} with message
     *                   {@code "timeout expired"}), or if image export or OCR fails
     */
    private String ocrEmbeddedImages(PDDocument document) throws Exception {
        StringBuilder text = new StringBuilder();
        long deadline = System.currentTimeMillis() + OCR_TIMEOUT_MILLIS;
        // Iterate as Object and cast so this compiles whether getAllPages() is raw or generic.
        for (Object pageObject : document.getDocumentCatalog().getAllPages()) {
            PDResources resources = ((PDPage)pageObject).getResources();
            if (resources == null) {
                // A page without resources has no images; skip it instead of failing with an NPE.
                continue;
            }
            Map<?, ?> images = resources.getImages();
            if (images == null) {
                continue;
            }
            for (Map.Entry<?, ?> entry : images.entrySet()) {
                // The budget is checked before each image, so one slow OCR call can still overrun it.
                if (System.currentTimeMillis() > deadline) {
                    throw new IOException("timeout expired");
                }
                String txt = this.ocrImage((String)entry.getKey(), (PDXObjectImage)entry.getValue());
                text.append(txt).append(" ");
            }
        }
        return text.toString();
    }

    /**
     * Writes one image to a temporary file, runs OCR on it and removes the file again.
     *
     * @param name  the image's key in the page resources, used as the temp-file prefix
     * @param image the image to recognise
     * @return the text returned by {@link #doOcr(File)}
     * @throws Exception if the temp file cannot be created or written, or if OCR fails
     */
    private String ocrImage(String name, PDXObjectImage image) throws Exception {
        String prefix = name;
        if (prefix.length() < MIN_TEMP_PREFIX_LENGTH) {
            // createTempFile rejects prefixes shorter than three characters; pad up to that length.
            prefix = prefix.concat(RandomStringUtils.randomAlphabetic((int)(MIN_TEMP_PREFIX_LENGTH - prefix.length())));
        }
        File pdfImg = File.createTempFile(prefix, "." + image.getSuffix());
        try {
            log.debug("Writing image: {}", (Object)pdfImg.getPath());
            image.write2file(pdfImg);
            String txt = this.doOcr(pdfImg);
            log.debug("OCR Extracted: {}", (Object)txt);
            return txt;
        }
        finally {
            // Remove the temp file even when export or OCR throws.
            FileUtils.deleteQuietly(pdfImg);
        }
    }

    /**
     * Closes the document held by {@code parser}, if there is one. Failures are logged at
     * debug level and otherwise ignored so that they never mask the extraction result.
     *
     * @param parser the parser whose document should be released
     */
    private void closeParsedDocument(PDFParser parser) {
        try {
            PDDocument doc = parser.getPDDocument();
            if (doc != null) {
                doc.close();
            }
        }
        catch (IOException e) {
            log.debug("Failed to close PDF document", (Throwable)e);
        }
    }

    /**
     * Runs OCR on an image file with the first registered engine, checked in the order
     * Cuneiform, Tesseract 3, Abby.
     *
     * @param pdfImg the image file to recognise
     * @return the recognised text, or an empty string when no OCR engine is registered
     * @throws Exception if the selected OCR engine fails
     */
    private String doOcr(File pdfImg) throws Exception {
        String text = "";
        if (RegisteredExtractors.isRegistered(CuneiformTextExtractor.class.getCanonicalName())) {
            text = new CuneiformTextExtractor().doOcr(pdfImg);
        } else if (RegisteredExtractors.isRegistered(Tesseract3TextExtractor.class.getCanonicalName())) {
            text = new Tesseract3TextExtractor().doOcr(pdfImg);
        } else if (RegisteredExtractors.isRegistered(AbbyTextExtractor.class.getCanonicalName())) {
            text = new AbbyTextExtractor().doOcr(pdfImg);
        } else {
            log.warn("No OCR engine configured");
        }
        return text;
    }

    static {
        // References PDFParser during class initialisation.
        // NOTE(review): presumably so a missing PDFBox dependency surfaces when this class is
        // loaded rather than on first use - confirm.
        PDFParser.class.getName();
    }
}

