/* * ShrinkerServlet * * For use of google/the web through phone XHTML browser with limited memory. * * bakert+miniweb@google.com, 2003-12-05 */ package net.bluebones.miniweb; import com.google.soap.search.GoogleSearch; import com.google.soap.search.GoogleSearchResult; import com.google.soap.search.GoogleSearchResultElement; import com.google.soap.search.GoogleSearchFault; import java.io.BufferedReader; import java.io.DataOutputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.InputStreamReader; import java.io.IOException; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; public class ShrinkerServlet extends HttpServlet { /* XXX does not deal with redirects. XXX problem with HTML-within-SCRIPT at http://www.itv.com/popidol/ XXX Third party lib for Html2Text like "Sherlock Holmes" or other? */ private boolean _rewriteLinks = true; public static final int RESULTS_PER_PAGE = 3; public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException { response.setContentType("text/html;charset=UTF-8"); PrintWriter out = response.getWriter(); try { String cmd = (request.getParameter("cmd") == null ? "" : request.getParameter("cmd")); int startAt = (request.getParameter("startAt") == null ? 0 : Integer.parseInt(request.getParameter("startAt"))); if (cmd.equals("search")) { out.print(getResults((String) request.getParameter("q"), startAt)); } else if (cmd.equals("show")) { out.print(getPage((String) request.getParameter("url"), ((String) request.getParameter("preferGoogle") ).equals("true") ? true : false)); } else if (cmd.equals("url")) { out.print(getUrlForm(request.getContextPath() + request.getPathInfo())); } else { out.print(getSearchForm(request.getContextPath() + request.getPathInfo())); } } catch (Exception e) { Log.log(e); out.print("
Exception on this page: " + e + "
"); } } public String getUrlForm(String path) { return getForm("show", "url", path, false); } public String getSearchForm(String path) { return getForm("search", "q", path, true); } public String getForm(String cmd, String inputId, String path, boolean preferGoogle) { // Make this as small as possible? (No name Parameter, no space before // forward slash, no BR, no value for submit.) final String BR = System.getProperty("line.separator"); StringBuffer sb = new StringBuffer(); sb.append("" + BR); return sb.toString(); } public String getResults(String query, int startAt) { final String BR = System.getProperty("line.separator"); StringBuffer sb = new StringBuffer(); // search for query in google GoogleSearch search = new GoogleSearch(); search.setKey("Xec0m+da23jiVXkYASXbZuQeYolQazFp"); search.setQueryString(query); search.setMaxResults(ShrinkerServlet.RESULTS_PER_PAGE + 1); search.setStartResult(startAt); try { GoogleSearchResult googleResult = search.doSearch(); GoogleSearchResultElement[] re = googleResult.getResultElements(); if (re.length == 0) { return "No results
"; } for (int i = 0; i < ShrinkerServlet.RESULTS_PER_PAGE && i < re.length; i++) { // initialize as our kind of result Result result = new Result(re[i]); // add to sb sb.append(""); } try { if (re.length == ShrinkerServlet.RESULTS_PER_PAGE + 1) { sb.append(""); } } catch (UnsupportedEncodingException e) { // don't be stupid, it's hardcoded above. throw new RuntimeException("Can't happen: " + e, e); } } catch (GoogleSearchFault e) { return "Google Error: " + e + "
"; } return sb.toString(); } public String getPage(String url, boolean preferGoogle) { boolean hasGoogleHeader = false; Log.log("URL is " + url); String page; if (preferGoogle) { Log.log("Trying google"); page = getCachedPage(url); hasGoogleHeader = true; } else { Log.log("Trying scrape"); page = scrapePage(url); } if (page == null && preferGoogle) { Log.log("Trying scrape 2"); page = scrapePage(url); } else if (page == null && ! preferGoogle) { Log.log("Trying google2"); page = getCachedPage(url); hasGoogleHeader = true; } if (page == null) { return "Cannot get page.
"; } URL actualUrl = getUrl(url); if (actualUrl == null) { return "Cannot get URL.
"; } // strip out everything return Shrinker.shrink(page, _rewriteLinks, hasGoogleHeader, actualUrl); } private String getCachedPage(String url) { // get page from google cache GoogleSearch search = new GoogleSearch(); search.setKey("Xec0m+da23jiVXkYASXbZuQeYolQazFp"); search.setSafeSearch(false); String page; try { page = new String(search.doGetCachedPage(url)); } catch (GoogleSearchFault e) { return null; } if (page.indexOf("Sorry, no content found for this URL") > -1) { return null; } return page; } private String scrapePage(String url) { URL finalUrl = getUrl(url); if (finalUrl == null) { return "Cannot get URL (scrapePage)"; } StringBuffer sb = new StringBuffer(); try { BufferedReader reader = new BufferedReader( new InputStreamReader(finalUrl.openStream())); String line; while ((line = reader.readLine()) != null) { sb.append(line + System.getProperty("line.separator")); } } catch (IOException e) { return null; } return sb.toString(); } private URL getUrl(String url) { try { return new URL(url.indexOf("://") > -1 ? url : "http://" + url + "/"); } catch (MalformedURLException e) { System.err.print(e.toString()); return null; } } public void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException { doGet(request, response); } } class Result { private String _title; private String _url; private String _snippet; public Result(String title, String url, String snippet) { _title = title; _url = url; _snippet = snippet; } public Result(GoogleSearchResultElement result) { this(result.getTitle(), result.getURL(), result.getSnippet()); } public String getTitle() { return _title; } public String getURL() { return _url; } public String getSnippet() { return _snippet; } } class Shrinker { private Shrinker() {} // XXX do we want preferGoogle here? public static String shrink(String html, boolean rewriteLinks, boolean hasGoogleHeader, URL url) { String newHtml = html; if (newHtml.indexOf("