/* * getNLPResult.java * * Created on May 25, 2008, 6:23 PM * * To change this template, choose Tools | Template Manager * and open the template in the editor. */ package edu.uiuc.lis.herbis; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.io.IOException; import java.io.*; import java.util.Properties; import de.gamta.Annotation; import de.gamta.MutableAnnotation; import de.gamta.Tokenizer; import de.gamta.Gamta; import de.htmlXmlUtil.Parser; import de.htmlXmlUtil.TreeNode; import de.gamta.defaultImplementation.RegExTokenizer; import de.goldenGate.GoldenGATE; import de.easyIO.help.HelpChapter; import de.gamta.QueriableAnnotation; import de.goldenGate.DocumentEditor; import de.goldenGate.resourceManagement.AbstractDocumentIO; import de.goldenGate.resources.DocumentFormat; import de.goldenGate.resources.DocumentFormatProvider; import de.goldenGate.resources.DocumentSaveOperation; import de.goldenGate.resources.DocumentSaver; import de.stringUtils.accessories.StringSelector; import de.easyIO.help.HelpChapter; import de.easyIO.settings.Settings; import de.gamta.DocumentRoot; import de.gamta.Gamta; import de.gamta.MutableAnnotation; import de.gamta.QueriableAnnotation; import de.goldenGate.DocumentEditor; import de.goldenGate.GoldenGATE; import de.goldenGate.resources.DocumentFormat; import de.goldenGate.resources.DocumentFormatProvider; import de.goldenGate.resources.GoldenGatePluginDataProvider; import de.goldenGate.resources.SettingsPanel; import de.goldenGate.util.DataListListener; import de.goldenGate.util.DataListPanel; import de.goldenGate.util.EditFontsButton; import de.goldenGate.util.FontEditable; import de.goldenGate.util.FontEditorDialog; import de.goldenGate.util.FontEditorPanel; import de.goldenGate.util.ResourceDialog; import de.htmlXmlUtil.Parser; import 
de.htmlXmlUtil.exceptions.ParseException;
import de.htmlXmlUtil.grammars.Grammar;
import de.htmlXmlUtil.grammars.Html;
import de.htmlXmlUtil.grammars.StandardGrammar;
import de.stringUtils.StringVector;
import de.gamta.util.SgmlDocumentReader;

/**
 * Sends text to the Herbis tagger CGI endpoint and copies the annotations
 * parsed from its response onto a GAMTA annotation.
 *
 * @author qinwei
 */
public class getNLPResult {

    /** Tagger endpoint; the URL-encoded input text is appended as the {@code rawlabel} value. */
    private static final String HMM_URL = "http://www3.isrl.uiuc.edu/~TeleNature/Herbis/src/"
            + "web/cgi-bin/tagger.cgi?modelno=1&rawlabel=";

    /** Not read or written anywhere in this class; retained because it is public. */
    public InputStream stream;

    /** Creates a new instance of getNLPResult */
    public getNLPResult() {
    }

    /**
     * Requests tagging of the given text from the Herbis endpoint.
     *
     * <p>The text is URL-encoded as UTF-8, appended to the endpoint URL, and the
     * response is read line by line; every line is terminated with {@code '\n'}
     * in the returned string.
     *
     * @param ocr the text to tag
     * @return the response body, or {@code null} if {@code ocr} is {@code null}
     *         or the request failed with an {@link IOException}
     */
    public String Parse(String ocr) {
        if (ocr == null) {
            return null;
        }
        BufferedReader responseReader = null;
        try {
            String ocrEncoded = URLEncoder.encode(ocr, "UTF-8");
            URL herbisFetchURL = new URL(HMM_URL + ocrEncoded);
            URLConnection conn = herbisFetchURL.openConnection();
            // getInputStream() always yields the raw body; getContent() returns a
            // handler-dependent Object that is not guaranteed to be an InputStream.
            // NOTE(review): the response is decoded with the platform default
            // charset -- confirm which encoding the tagger actually sends.
            responseReader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            StringBuffer herbisResults = new StringBuffer();
            String line = responseReader.readLine();
            while (line != null) {
                herbisResults.append(line).append('\n');
                line = responseReader.readLine();
            }
            return herbisResults.toString();
        } catch (IOException e) {
            System.err.println("Herbis NLP request failed: " + e);
            return null;
        } finally {
            if (responseReader != null) {
                try {
                    responseReader.close();
                } catch (IOException ignored) {
                    // The body has already been read (or the failure reported);
                    // a close error carries no further information.
                }
            }
        }
    }

    /**
     * Tags the text of {@code data} remotely and adds one annotation to
     * {@code data} for every annotation found in the tagger output.
     *
     * <p>The tagger output is written to a temporary file, read back through
     * {@link SgmlDocumentReader}, and each resulting annotation's type, start
     * index and size are passed to {@code data.addAnnotation}. Failures are
     * reported on {@code System.err} and never propagate to the caller.
     *
     * @param data the annotation whose value is tagged and which receives the
     *        new annotations
     * @param parameters not used by this implementation
     */
    public void process(MutableAnnotation data, Properties parameters) {
        File file = null;
        try {
            String input = data.getValue();
            String output = Parse(input);
            if (output == null) {
                System.err.println("Herbis NLP returned no output; no annotations were added.");
                return;
            }
            System.out.println("Herbis NLP output is ");
            System.out.println(output);
            // NOTE(review): this code used to trim the output with
            // substring(indexOf("")), which is a no-op. A start marker may have
            // been intended -- confirm against the tagger's response format.

            // A unique temporary file avoids clobbering a fixed file in the
            // working directory and collisions between concurrent runs.
            file = File.createTempFile("herbis", ".xml");
            writeToFile(output, file);

            StandardGrammar grammar = new StandardGrammar();
            DocumentRoot document = Gamta.newDocument(Gamta.INNER_PUNCTUATION_TOKENIZER);
            SgmlDocumentReader dc = new SgmlDocumentReader(document, grammar, null, null, null);
            document = dc.readDocument(file);
            dc.close();

            // NOTE(review): start index and size are taken from the parsed
            // tagger document and applied to data unchanged -- presumably both
            // tokenize the same text identically; verify.
            Annotation[] candidates = document.getAnnotations();
            for (int i = 0; i < candidates.length; i++) {
                Annotation candidate = candidates[i];
                data.addAnnotation(candidate.getType(), candidate.getStartIndex(), candidate.size());
            }
        } catch (Exception e) {
            // Best effort: tagging problems must not break the caller, but they
            // must not vanish silently either.
            System.err.println("Herbis NLP processing failed: " + e);
        } finally {
            if (file != null && !file.delete()) {
                file.deleteOnExit();
            }
        }
    }

    /**
     * Writes {@code text} to {@code target}, replacing any existing content.
     * The platform default encoding is used.
     *
     * @throws IOException if the file cannot be opened or the write fails
     */
    private static void writeToFile(String text, File target) throws IOException {
        PrintStream ps = new PrintStream(new FileOutputStream(target));
        try {
            ps.print(text);
            // PrintStream swallows I/O errors; checkError() flushes and reports them.
            if (ps.checkError()) {
                throw new IOException("Could not write tagger output to " + target);
            }
        } finally {
            ps.close();
        }
    }
}