/* * ShrinkerServlet * * For use of google/the web through phone XHTML browser with limited memory. * * bakert+miniweb@google.com, 2003-12-05 */
/*
 * NOTE(review) - state of this copy
 * ---------------------------------
 * This copy looks as if it went through an HTML stripper and a whitespace
 * collapser: markup inside string literals and regular expressions is missing
 * (the bodies of getForm and getResults, most patterns in Shrinker.shrink, the
 * loop condition in Shrinker.rewriteLinks), several literals are split across
 * physical lines, and former end-of-line comments now run into the code that
 * followed them. It does not compile as shown. Restore the original text
 * before changing any logic. Everything below describes only what the
 * surviving code shows.
 *
 * Overview
 * --------
 * ShrinkerServlet
 *   doGet          dispatches on the cmd parameter (see its own comment).
 *   doPost         delegates to doGet.
 *   getUrlForm     getForm with cmd=show, input id url, preferGoogle=false.
 *   getSearchForm  getForm with cmd=search, input id q, preferGoogle=true.
 *   getForm        builds the form text in a StringBuffer, one piece per
 *                  platform line separator; the markup itself is missing here.
 *   getResults     runs a GoogleSearch for the query starting at startAt and
 *                  asks for RESULTS_PER_PAGE + 1 hits. Returns the No results
 *                  text when there are none, otherwise appends the title of at
 *                  most RESULTS_PER_PAGE hits and a More entry when the extra
 *                  hit came back. A GoogleSearchFault is turned into the
 *                  Google Error text instead of being thrown.
 *   getPage        tries the Google cache first when preferGoogle is true,
 *                  otherwise a direct fetch first, and falls back to the other
 *                  source when the first yields null. Returns the Cannot get
 *                  page / Cannot get URL texts on failure, otherwise the
 *                  result of Shrinker.shrink.
 *   getCachedPage  returns null on GoogleSearchFault or when the cached text
 *                  contains the no-content marker. The byte array is decoded
 *                  with new String(bytes), i.e. no charset is named.
 *   scrapePage     reads the URL line by line; returns null on IOException.
 *   getUrl         builds a URL, prefixing the http scheme and appending a
 *                  slash when the input has no scheme separator; returns null
 *                  (after printing to System.err) on MalformedURLException.
 * Result           holder for title, url and snippet with getters only.
 * Shrinker         static helpers. shrink optionally cuts everything up to the
 *                  end of the Google header, replaces CR and LF with spaces,
 *                  applies a series of replaceAll calls, calls rewriteImgs
 *                  (replaces a match with its alt text via $1), strips the
 *                  remaining tags, collapses whitespace and truncates the
 *                  result to 15000 characters.
 * Log              appends to miniweb.log (relative path), opening and closing
 *                  a FileWriter per call; failures go to System.err.
 *
 * NOTE(review) - points to confirm once the original text is restored
 * -------------------------------------------------------------------
 *  - The Google API key is hard-coded in getResults and getCachedPage.
 *  - getPage sets hasGoogleHeader to true before the cache attempt and never
 *    resets it, so it is still true when the cache fails and the scrape
 *    fallback supplies the page.
 *  - scrapePage never closes its BufferedReader, and returns an error string
 *    (not null) when getUrl fails, so getPage treats that string as a page
 *    until its own getUrl check.
 *  - In rewriteLink, textStart and start are computed as indexOf(...) plus a
 *    literal length, so the == -1 comparisons that follow cannot be true.
 *  - In rewriteLinks the final append of the remaining text is commented out,
 *    and the local x is never used.
 */
package net.bluebones.miniweb; import com.google.soap.search.GoogleSearch; import com.google.soap.search.GoogleSearchResult; import com.google.soap.search.GoogleSearchResultElement; import com.google.soap.search.GoogleSearchFault; import java.io.BufferedReader; import java.io.DataOutputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.InputStreamReader; import java.io.IOException; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse;
/**
 * Servlet that serves a cut-down view of Google search results and of
 * arbitrary pages. The action is chosen by the cmd request parameter:
 * search, show, url, or anything else for the default search form.
 */
public class ShrinkerServlet extends HttpServlet { /* XXX does not deal with redirects. XXX problem with HTML-within-SCRIPT at http://www.itv.com/popidol/ XXX Third party lib for Html2Text like "Sherlock Holmes" or other? */
/** Passed to Shrinker.shrink by getPage; never reassigned in the visible code. */
private boolean _rewriteLinks = true;
/** Number of hits shown per page; getResults requests one more than this to decide whether to offer More. */
public static final int RESULTS_PER_PAGE = 3;
/**
 * Writes the response for a GET request as text/html in UTF-8.
 *
 * cmd defaults to the empty string and startAt to 0 when absent. startAt is
 * parsed before cmd is examined, so a non-numeric value fails every command.
 * Any Exception raised while building the output is passed to Log.log and
 * its string form is written into the response body.
 *
 * NOTE(review): for cmd=show, equals is called directly on the preferGoogle
 * parameter. If getParameter returns null for it, this throws
 * NullPointerException, which is then handled by the catch further down.
 *
 * @param request  the incoming request; parameters cmd, startAt, q, url and
 *                 preferGoogle are read from it
 * @param response the response whose writer receives the output
 * @throws IOException declared by the signature; response.getWriter() is
 *                     called outside the try block
 */
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException { response.setContentType("text/html;charset=UTF-8"); PrintWriter out = response.getWriter(); try { String cmd = (request.getParameter("cmd") == null ? "" : request.getParameter("cmd")); int startAt = (request.getParameter("startAt") == null ? 0 : Integer.parseInt(request.getParameter("startAt"))); if (cmd.equals("search")) { out.print(getResults((String) request.getParameter("q"), startAt)); } else if (cmd.equals("show")) { out.print(getPage((String) request.getParameter("url"), ((String) request.getParameter("preferGoogle") ).equals("true") ? 
true : false)); } else if (cmd.equals("url")) { out.print(getUrlForm(request.getContextPath() + request.getPathInfo())); } else { out.print(getSearchForm(request.getContextPath() + request.getPathInfo())); } } catch (Exception e) { Log.log(e); out.print("

Exception on this page: " + e + "

"); } } public String getUrlForm(String path) { return getForm("show", "url", path, false); } public String getSearchForm(String path) { return getForm("search", "q", path, true); } public String getForm(String cmd, String inputId, String path, boolean preferGoogle) { // Make this as small as possible? (No name Parameter, no space before // forward slash, no BR, no value for submit.) final String BR = System.getProperty("line.separator"); StringBuffer sb = new StringBuffer(); sb.append("
" + BR); sb.append("" + BR); sb.append("" + BR); sb.append("" + BR); sb.append(" " + BR); sb.append("
" + BR); return sb.toString(); } public String getResults(String query, int startAt) { final String BR = System.getProperty("line.separator"); StringBuffer sb = new StringBuffer(); // search for query in google GoogleSearch search = new GoogleSearch(); search.setKey("Xec0m+da23jiVXkYASXbZuQeYolQazFp"); search.setQueryString(query); search.setMaxResults(ShrinkerServlet.RESULTS_PER_PAGE + 1); search.setStartResult(startAt); try { GoogleSearchResult googleResult = search.doSearch(); GoogleSearchResultElement[] re = googleResult.getResultElements(); if (re.length == 0) { return "

No results

"; } for (int i = 0; i < ShrinkerServlet.RESULTS_PER_PAGE && i < re.length; i++) { // initialize as our kind of result Result result = new Result(re[i]); // add to sb sb.append("

" + result.getTitle() + "

"); } try { if (re.length == ShrinkerServlet.RESULTS_PER_PAGE + 1) { sb.append("

More

"); } } catch (UnsupportedEncodingException e) { // don't be stupid, it's hardcoded above. throw new RuntimeException("Can't happen: " + e, e); } } catch (GoogleSearchFault e) { return "

Google Error: " + e + "

"; } return sb.toString(); } public String getPage(String url, boolean preferGoogle) { boolean hasGoogleHeader = false; Log.log("URL is " + url); String page; if (preferGoogle) { Log.log("Trying google"); page = getCachedPage(url); hasGoogleHeader = true; } else { Log.log("Trying scrape"); page = scrapePage(url); } if (page == null && preferGoogle) { Log.log("Trying scrape 2"); page = scrapePage(url); } else if (page == null && ! preferGoogle) { Log.log("Trying google2"); page = getCachedPage(url); hasGoogleHeader = true; } if (page == null) { return "

Cannot get page.

"; } URL actualUrl = getUrl(url); if (actualUrl == null) { return "

Cannot get URL.

"; } // strip out everything return Shrinker.shrink(page, _rewriteLinks, hasGoogleHeader, actualUrl); } private String getCachedPage(String url) { // get page from google cache GoogleSearch search = new GoogleSearch(); search.setKey("Xec0m+da23jiVXkYASXbZuQeYolQazFp"); search.setSafeSearch(false); String page; try { page = new String(search.doGetCachedPage(url)); } catch (GoogleSearchFault e) { return null; } if (page.indexOf("Sorry, no content found for this URL") > -1) { return null; } return page; } private String scrapePage(String url) { URL finalUrl = getUrl(url); if (finalUrl == null) { return "Cannot get URL (scrapePage)"; } StringBuffer sb = new StringBuffer(); try { BufferedReader reader = new BufferedReader( new InputStreamReader(finalUrl.openStream())); String line; while ((line = reader.readLine()) != null) { sb.append(line + System.getProperty("line.separator")); } } catch (IOException e) { return null; } return sb.toString(); } private URL getUrl(String url) { try { return new URL(url.indexOf("://") > -1 ? url : "http://" + url + "/"); } catch (MalformedURLException e) { System.err.print(e.toString()); return null; } } public void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException { doGet(request, response); } } class Result { private String _title; private String _url; private String _snippet; public Result(String title, String url, String snippet) { _title = title; _url = url; _snippet = snippet; } public Result(GoogleSearchResultElement result) { this(result.getTitle(), result.getURL(), result.getSnippet()); } public String getTitle() { return _title; } public String getURL() { return _url; } public String getSnippet() { return _snippet; } } class Shrinker { private Shrinker() {} // XXX do we want preferGoogle here? public static String shrink(String html, boolean rewriteLinks, boolean hasGoogleHeader, URL url) { String newHtml = html; if (newHtml.indexOf("= 0) { Log.log("FRAMESET: this is a frameset. 
" + "Probably won't get any text"); } if (hasGoogleHeader) { final String googleEnd = "
"; int start = newHtml.indexOf(googleEnd) + googleEnd.length(); int end = newHtml.length(); if ((start > (-1 + googleEnd.length())) && (end > start)) { newHtml = newHtml.substring(start, end); } } Log.log("rewriteLinks is " + rewriteLinks); // line endings intentionally hardcoded (not "line.separator") to be // cross-platform at the server end (web page could come from any OS). newHtml = newHtml.replaceAll("\n", " "); newHtml = newHtml.replaceAll("\r", " "); // remove all HTML (except a tags if required) newHtml = newHtml.replaceAll("", ""); newHtml = newHtml.replaceAll("<[Hh][Ee][Aa][Dd]>.*?", ""); newHtml = newHtml.replaceAll( "<[Ss][Cc][Rr][Ii][Pp][Tt][^>]*>.*?", ""); newHtml = newHtml.replaceAll("<[Ss][Cc][Rr][Ii][Pp][Tt][^>]*>[^>]*>", ""); newHtml = newHtml.replaceAll( "<[Ss][Tt][Yy][Ll][Ee][^>]*>.*?", ""); newHtml = newHtml.replaceAll(" ", " "); newHtml = rewriteImgs(newHtml); if (rewriteLinks) { newHtml = newHtml.replaceAll("", "--LINKEND--"); } newHtml = newHtml.replaceAll("<[^>]+>", ""); if (rewriteLinks) { newHtml = newHtml.replaceAll("--LINKSTART--", ""); newHtml = rewriteLinks(newHtml, url); } // remove all unnecessary whitespace newHtml = newHtml.replaceAll("\\s", " "); while (newHtml.indexOf(" ") > -1) { newHtml = newHtml.replaceAll(" ", " "); } if (newHtml.length() == 0) { Log.log("ERROR: No text to return"); } else if (newHtml.length() > 15000) { //XXX better to look for last space before 15000 chars // or last close tag or something? 
Log.log("Truncated text from " + newHtml.length() + " to 15000 chars"); newHtml = newHtml.substring(0, 15000); } return newHtml; } private static String rewriteLinks(String html, URL url) { StringBuffer sb = new StringBuffer(); String oldHtml = html; int x = 0; int endMarker = 0; while (oldHtml.indexOf(" -1) { int linkStart = oldHtml.indexOf("", linkStart) + "".length(); try { String link = oldHtml.substring(linkStart, endMarker); String newLink = rewriteLink(link, url); sb.append(oldHtml.substring(0, linkStart) + newLink); oldHtml = oldHtml.substring(endMarker); } catch (StringIndexOutOfBoundsException e) { Log.log("linkStart is " + linkStart + " and endMarker is " + endMarker + " and have got this error: " + e); break; } } //sb.append(oldHtml.substring(endMarker)); return sb.toString(); } private static String rewriteLink(String aTag, URL url) { int textStart = aTag.indexOf(">") + ">".length(); int textEnd = aTag.indexOf("", textStart); if (textStart == -1 || textStart >= textEnd) { return ""; } String text = aTag.substring(textStart, textEnd); int start = aTag.indexOf("href=\"") + "href=\"".length(); int end = aTag.indexOf("\"", start); if (start == -1 || start >= end) { start = aTag.indexOf("href='") + "href='".length(); end = aTag.indexOf("'", start); } if (start == -1 || start >= end) { start = aTag.indexOf("href=") + "href=".length(); end = aTag.indexOf(" ", start); end = (end == -1 ? 
aTag.indexOf(">", start) : end); } if (start == -1 || start >= end) { Log.log("Cannot get link from '" + aTag.replaceAll("<", "<") + "' with start " + start + " and end " + end); return "-unreadable link-"; } String link = aTag.substring(start, end); if (text.trim().equals("")) { return "nolink"; } StringBuffer finalUrl = new StringBuffer(); if (link.indexOf("http") != 0) { finalUrl.append(url.getProtocol() + "://" + url.getHost()); if (url.getDefaultPort() != url.getPort() && url.getPort() != -1) { finalUrl.append(":" + url.getPort()); } if (link.charAt(0) != '/') { int lastSlash = url.getPath().lastIndexOf("/"); if (lastSlash != 0) { finalUrl.append("/"); } finalUrl.append(url.getPath().substring(0, lastSlash)); } } finalUrl.append(link.replaceAll("&", "&")); try { //XXX preferGoogle passed in or always false? or always true? return "" + text + ""; } catch (UnsupportedEncodingException e) { // don't be daft, it's hardcoded above. throw new RuntimeException(e.toString(), e); } } public static String rewriteImgs(String s) { return s.replaceAll("]*alt=\"([^\"]*)\"[^>]*>", "$1"); } } class Log { private static final String LOG = "miniweb.log"; private Log() {} public static void log(String s) { try { FileWriter fw = new FileWriter(LOG, true); PrintWriter prw = new PrintWriter(fw, true); prw.println(s); prw.flush(); prw.close(); } catch (FileNotFoundException e) { System.err.println(e.toString()); } catch (IOException e) { System.err.println(e.toString()); } } public static void log(Throwable t) { final String LINE = "====================================" + "===================================="; try { FileWriter fw = new FileWriter(LOG, true); PrintWriter prw = new PrintWriter(fw, true); prw.println(LINE); prw.println("=== Exception ======================" + "===================================="); prw.println(LINE); t.printStackTrace(prw); prw.println(LINE); prw.println(LINE); prw.println(LINE); prw.println(); prw.flush(); prw.close(); } catch (FileNotFoundException 
e) { System.err.println(e.toString()); } catch (IOException e) { System.err.println(e.toString()); } } }