package smart_gs.transcription_tool.tesseract;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Reads an hOCR (HTML) file and converts its {@code .ocr_line} elements, together
 * with the {@code ocrx_word} elements nested in them, into a {@link HocrDocument}.
 */
public class HocrParser {

	/** Matches the {@code bbox x0 y0 x1 y1} entry inside an hOCR {@code title} attribute. */
	private static final Pattern BBOX_PATTERN =
			Pattern.compile("bbox (\\d+) (\\d+) (\\d+) (\\d+)");

	/**
	 * Command-line entry point: passes the three arguments to
	 * {@link TesseractRunner#exec}, parses {@code args[1] + ".html"} and prints the
	 * bounding box of every recognised line.
	 *
	 * @param args exactly the three values forwarded to {@code TesseractRunner.exec};
	 *             the hOCR file is read from {@code args[1] + ".html"}
	 */
	public static void main(String[] args) {
		// Guard against a bare ArrayIndexOutOfBoundsException on missing arguments.
		if (args == null || args.length < 3) {
			System.err.println("HocrParser expects 3 arguments (forwarded to TesseractRunner.exec);"
					+ " the hOCR file is read from <second argument>.html");
			return;
		}
		try {
			TesseractRunner r = new TesseractRunner();
			r.exec(args[0], args[1], args[2]);
			HocrParser parser = new HocrParser();
			HocrDocument doc = parser.parse(args[1] + ".html");
			for (HocrLine line : doc.getLines()) {
				System.out.println(line.getBbox());
			}
		} catch (TesseractRuntimeException e) {
			e.printStackTrace();
		} catch (HocrParseException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses the given hOCR file (read as UTF-8) into a {@link HocrDocument}.
	 *
	 * @param hocrFile the hOCR file to read
	 * @return a document holding one {@link HocrLine} per {@code .ocr_line} element,
	 *         in document order
	 * @throws HocrParseException if the file cannot be read or a {@code title}
	 *         attribute carries no usable bbox; the underlying exception is
	 *         attached as the cause
	 */
	public HocrDocument parse(File hocrFile) throws HocrParseException {
		HocrDocument hocrDoc = new HocrDocument();
		try {
			Document doc = Jsoup.parse(hocrFile, "utf-8");
			Elements lineElems = doc.select(".ocr_line");
			List<HocrLine> lines = new ArrayList<HocrLine>();
			for (Element lineElem : lineElems) {
				HocrLine line = new HocrLine();
				line.setId(lineElem.attr("id"));
				line.setBbox(this.parseBBoxStr(lineElem.attr("title")));
				line.setWords(this.getHocrWords(lineElem));
				lines.add(line);
			}
			hocrDoc.setLines(lines);
		} catch (ParseException e) {
			throw failure("Parsing HOCR document failed.", e);
		} catch (IOException e) {
			throw failure("Specified HOCR file not found.", e);
		}
		return hocrDoc;
	}

	/**
	 * Convenience overload of {@link #parse(File)} taking a file system path.
	 *
	 * @param hocrFilePath path of the hOCR file
	 * @return the parsed document
	 * @throws HocrParseException see {@link #parse(File)}
	 */
	public HocrDocument parse(String hocrFilePath) throws HocrParseException {
		return this.parse(new File(hocrFilePath));
	}

	/**
	 * Builds a {@link HocrParseException} that keeps the original failure as its
	 * cause, so the stack trace still shows what actually went wrong.
	 */
	private static HocrParseException failure(String message, Throwable cause) {
		HocrParseException failure = new HocrParseException(message);
		// Only the (String) constructor is known here, so the cause is attached afterwards.
		failure.initCause(cause);
		return failure;
	}

	/**
	 * Collects every {@code ocrx_word} element found under the given line element.
	 *
	 * @param lineElem an {@code .ocr_line} element
	 * @return the words in document order; empty if the line contains none
	 * @throws ParseException if a word's {@code title} attribute has no usable bbox
	 */
	private List<HocrWord> getHocrWords(Element lineElem) throws ParseException {
		Elements wordElems = lineElem.getElementsByClass("ocrx_word");
		List<HocrWord> words = new ArrayList<HocrWord>();
		for (Element wordElem : wordElems) {
			HocrWord word = new HocrWord();
			word.setId(wordElem.attr("id"));
			word.setBbox(this.parseBBoxStr(wordElem.attr("title")));
			word.setWord(wordElem.text());
			words.add(word);
		}
		return words;
	}

	/**
	 * Extracts the first {@code bbox x0 y0 x1 y1} entry from a {@code title}
	 * attribute value; any other text around it is ignored.
	 *
	 * @param str the raw {@code title} attribute value
	 * @return the bounding box built from the four coordinates
	 * @throws ParseException if no bbox entry is present or a coordinate does not
	 *         fit into an {@code int}
	 */
	private BBox parseBBoxStr(String str) throws ParseException {
		Matcher m = BBOX_PATTERN.matcher(str);
		if (!m.find()) {
			throw new ParseException("Parsing bbox attr failed:" + str, 0);
		}
		try {
			int x0 = Integer.parseInt(m.group(1));
			int y0 = Integer.parseInt(m.group(2));
			int x1 = Integer.parseInt(m.group(3));
			int y1 = Integer.parseInt(m.group(4));
			return new BBox(x0, y0, x1, y1);
		} catch (NumberFormatException e) {
			// \d+ accepts digit runs beyond the int range; report them as a parse
			// failure instead of letting an unchecked exception escape parse().
			throw new ParseException("Parsing bbox attr failed:" + str, 0);
		}
	}

}
