/*
 * WebCrawler.java
 *
 * Original from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/
 * Modified by Frank McCown (Harding University)
 */

import java.applet.Applet;
import java.awt.BorderLayout;
import java.awt.Button;
import java.awt.Choice;
import java.awt.FlowLayout;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Label;
import java.awt.Panel;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * A small breadth-first web crawler with an AWT user interface.
 *
 * <p>The user enters a starting URL and a search limit. Pressing "Search" starts a
 * background thread that repeatedly takes the first URL off the frontier, downloads
 * it if it is an HTML page served over plain http, extracts the anchor links it
 * contains and appends the unseen, HTML-looking ones to the frontier. The crawl
 * ends when the frontier is empty, the search limit is reached, or "Stop" is
 * pressed.
 *
 * <p>The class can run as an applet or, through {@link #main(String[])}, inside a
 * stand-alone frame.
 */
public class WebCrawler extends Applet implements ActionListener, Runnable {

    public static final String SEARCH = "Search";
    public static final String STOP = "Stop";
    public static final String CLEAR = "Clear";
    public static final String USER_AGENT = "hardingbot";

    /** Entries of the "Search limit" drop-down; a non-numeric entry means "no limit". */
    private static final String[] SEARCH_LIMITS = {"1", "2", "3", "4", "5", "10", "25", "None"};

    /** Index into {@link #SEARCH_LIMITS} that is selected initially (the entry "5"). */
    private static final int DEFAULT_LIMIT_INDEX = 4;

    /** Limit used when the selected entry is not a number. */
    private static final int NO_LIMIT = Integer.MAX_VALUE;

    /** Pause between two successfully crawled pages, in milliseconds. */
    private static final long CRAWL_DELAY_MS = 3000L;

    /** Connect and read timeout for page downloads, in milliseconds. */
    private static final int TIMEOUT_MS = 15000;

    /** Charset used when the Content-Type header names none (or an unsupported one). */
    private static final String DEFAULT_CHARSET = "UTF-8";

    private Panel panelMain;
    private java.awt.List listFrontier;
    private java.awt.List listVisited;
    private Label labelStatus;
    private TextField textURL;
    private Choice searchLimit;

    /** URLs to be searched, in crawl order. */
    private final Vector<String> frontier = new Vector<String>();

    /** URLs already seen (already crawled or in the frontier); only the keys matter. */
    private final Hashtable<String, String> seenUrls = new Hashtable<String, String>();

    /** URLs that have been taken off the frontier. */
    private final Vector<String> visitedUrls = new Vector<String>();

    /**
     * The thread that currently owns the crawl, or {@code null} when idle. Written by
     * the UI thread and read by the crawler thread, hence volatile.
     */
    private volatile Thread searchThread;

    /** Builds the user interface and configures URL access defaults. */
    @Override
    public void init() {
        // set up the main UI panel
        panelMain = new Panel(new BorderLayout(5, 5));

        // text entry components
        Panel panelEntry = new Panel(new BorderLayout(5, 5));

        Panel panelURL = new Panel(new FlowLayout(FlowLayout.LEFT, 5, 5));
        panelURL.add(new Label("Starting URL: ", Label.RIGHT));
        textURL = new TextField("", 40);
        panelURL.add(textURL);
        panelEntry.add(panelURL, BorderLayout.NORTH);

        Panel panelType = new Panel(new FlowLayout(FlowLayout.LEFT, 5, 5));
        panelType.add(new Label("Search limit: ", Label.RIGHT));
        searchLimit = new Choice();
        for (String limit : SEARCH_LIMITS) {
            searchLimit.add(limit);
        }
        searchLimit.select(DEFAULT_LIMIT_INDEX);
        panelType.add(searchLimit);
        panelEntry.add(panelType, BorderLayout.SOUTH);

        panelMain.add(panelEntry, BorderLayout.NORTH);

        // list of result URLs
        Panel panelListButtons = new Panel(new BorderLayout(5, 5));

        Panel panelList = new Panel(new BorderLayout(5, 5));
        panelList.add(new Label("Frontier"), BorderLayout.NORTH);

        Panel panelListCurrent = new Panel(new BorderLayout(5, 5));
        listFrontier = new java.awt.List(10);
        panelListCurrent.add(listFrontier, BorderLayout.NORTH);
        panelList.add(panelListCurrent, BorderLayout.CENTER);

        labelStatus = new Label("");
        Panel visited = new Panel(new BorderLayout(5, 5));
        visited.add(new Label("Visited URLs"), BorderLayout.NORTH);
        listVisited = new java.awt.List(10);
        visited.add(listVisited, BorderLayout.CENTER);
        visited.add(labelStatus, BorderLayout.SOUTH);
        panelList.add(visited, BorderLayout.SOUTH);

        panelListButtons.add(panelList, BorderLayout.NORTH);

        // control buttons
        Panel panelButtons = new Panel();
        panelButtons.add(newButton(SEARCH));
        panelButtons.add(newButton(STOP));
        panelButtons.add(newButton(CLEAR));
        panelListButtons.add(panelButtons, BorderLayout.SOUTH);

        panelMain.add(panelListButtons, BorderLayout.SOUTH);

        add(panelMain);
        setVisible(true);
        repaint();

        // set default for URL access
        URLConnection.setDefaultAllowUserInteraction(false);
    }

    /** Creates a button with the given label whose clicks are delivered to this applet. */
    private Button newButton(String label) {
        Button button = new Button(label);
        button.addActionListener(this);
        return button;
    }

    @Override
    public void start() {
    }

    /**
     * Asks a running crawl to stop. The crawler thread notices that it no longer owns
     * {@link #searchThread} at its next loop check; the interrupt cuts short the delay
     * between pages so the stop takes effect promptly.
     */
    @Override
    public void stop() {
        Thread running = searchThread;
        if (running != null) {
            setStatus("stopping...");
            searchThread = null;
            running.interrupt();
        }
    }

    @Override
    public void destroy() {
    }

    /**
     * Read robots.txt for this URL.
     *
     * <p>Stub: it only builds the robots.txt URL for the host and logs it; nothing is
     * downloaded or parsed.
     */
    void readRobotsTxt(URL url) {
        String host = url.getHost();

        // Form URL of the robots.txt file
        String robot = "http://" + host + "/robots.txt";

        System.out.println("Checking for " + robot + " ...");
    }

    /**
     * Determine if this URL is safe to read according to robots.txt.
     *
     * <p>Stub: always returns {@code true}.
     */
    boolean robotSafe(URL url) {
        return true;
    }

    @Override
    public void paint(Graphics g) {
        // Draw a Rectangle around the applet's display area.
        g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);

        panelMain.paint(g);
        panelMain.paintComponents(g);
    }

    /**
     * Crawl loop executed on the search thread. Starts from the URL in the text field
     * and processes the frontier until it is empty, the search limit is reached, the
     * starting URL turns out to be invalid, or the search is stopped.
     */
    public void run() {
        String strURL = textURL.getText().trim();
        int numberSearched = 0;

        if (strURL.length() == 0) {
            setStatus("ERROR: must enter a starting URL");
            releaseSearchThread();
            return;
        }

        int limit = selectedSearchLimit();

        // initialize search data structures
        clearAll();
        frontier.addElement(strURL);
        listFrontier.add(strURL);
        seenUrls.put(strURL, "");

        while (frontier.size() > 0 && Thread.currentThread() == searchThread) {
            // get the first element from the to be searched list
            strURL = frontier.elementAt(0);
            setStatus("Accessing " + strURL);

            URL url;
            try {
                url = new URL(strURL);
            } catch (MalformedURLException e) {
                setStatus("ERROR: invalid URL " + strURL);
                break;
            }

            // Get URL off the frontier and mark as being visited
            frontier.removeElementAt(0);
            listFrontier.remove(0);
            visitedUrls.addElement(strURL);

            // Can only search http: protocol URLs
            if (!"http".equals(url.getProtocol())) {
                continue;
            }

            // Make sure it is safe to crawl
            if (!robotSafe(url)) {
                continue;
            }

            if (!crawlPage(url, strURL)) {
                continue;
            }

            numberSearched++;
            if (numberSearched >= limit) {
                break;
            }

            // Delay before accessing next URL
            try {
                System.out.println("Sleeping for 3 seconds...");
                Thread.sleep(CRAWL_DELAY_MS);
            } catch (InterruptedException e) {
                // stop() interrupts the delay; keep the flag set and leave the loop
                Thread.currentThread().interrupt();
                break;
            }
        }

        if (numberSearched >= limit) {
            System.out.println("Reached search limit of " + limit + ". "
                    + frontier.size() + " URLs left in frontier.");
            setStatus("Reached search limit of " + limit);
        } else {
            System.out.println("Frontier has " + frontier.size() + " items left.");
            setStatus("Done");
        }

        releaseSearchThread();
    }

    /**
     * Marks the crawler as idle, but only if the calling thread still owns the crawl.
     * A run that was stopped must not clear the slot of a search started after it.
     */
    private void releaseSearchThread() {
        if (searchThread == Thread.currentThread()) {
            searchThread = null;
        }
    }

    /** Returns the numeric search limit chosen in the UI, or {@link #NO_LIMIT}. */
    private int selectedSearchLimit() {
        try {
            return Integer.parseInt(searchLimit.getSelectedItem());
        } catch (NumberFormatException e) {
            // "None" must be selected
            return NO_LIMIT;
        }
    }

    /**
     * Downloads one page and feeds the links found on it to {@link #considerLink}.
     *
     * @param url    the page to fetch
     * @param strURL the same URL in the textual form shown in the UI
     * @return {@code true} if the page answered 200 with an HTML content type and was
     *         scanned for links; {@code false} if it was rejected or could not be read
     */
    private boolean crawlPage(URL url, String strURL) {
        HttpURLConnection connection = null;
        try {
            System.out.println("Accessing [" + url + "] ...");

            // try opening the URL
            URLConnection opened = url.openConnection();
            if (!(opened instanceof HttpURLConnection)) {
                System.out.println("Did not receive 200 OK response.");
                return false;
            }
            connection = (HttpURLConnection) opened;
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setAllowUserInteraction(false);
            // without timeouts an unresponsive server would block the crawl forever
            connection.setConnectTimeout(TIMEOUT_MS);
            connection.setReadTimeout(TIMEOUT_MS);

            // Only access pages returning 200 OK status
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                System.out.println("Did not receive 200 OK response.");
                return false;
            }

            String type = connection.getContentType();
            System.out.println("Content-Type: [" + type + "]");

            if (type == null) {
                System.out.println("Unable to determine resource MIME type.");
                return false;
            }
            if (!type.regionMatches(true, 0, "text/html", 0, "text/html".length())) {
                System.out.println("Rejecting non-HTML resource.");
                return false;
            }

            // Mark as visited
            listVisited.add(strURL);

            // the body is read from the connection that is already open, so the page
            // is requested once and with the crawler's User-Agent
            String content = getPageContent(connection, type);

            System.out.println("Looking for links...");
            for (String target : extractLinkTargets(content)) {
                considerLink(url, target);
            }
            return true;
        } catch (IOException e) {
            setStatus("ERROR: couldn't open URL " + strURL);
            return false;
        } catch (RuntimeException e) {
            // one misbehaving page must not terminate the whole crawl
            setStatus("ERROR: couldn't open URL " + strURL);
            System.out.println("Bad URL: [" + strURL + "]");
            return false;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Resolves a link found on a page and adds it to the frontier if it is an unseen,
     * HTML-looking http URL.
     *
     * @param base    the page the link was found on; relative links resolve against it
     * @param strLink the raw href value
     */
    private void considerLink(URL base, String strLink) {
        System.out.println("Found possible link: [" + strLink + "]");

        URL urlLink;
        try {
            urlLink = new URL(base, strLink);
            strLink = urlLink.toString();
            System.out.println("Full URL: [" + strLink + "]");
        } catch (MalformedURLException e) {
            setStatus("ERROR: bad URL " + strLink);
            System.out.println(" Rejecting bad URL [" + strLink + "]");
            return;
        }

        // only look at http links; skip just this one and keep scanning the page
        if (!"http".equals(urlLink.getProtocol())) {
            System.out.println(" Rejecting non-http link");
            return;
        }

        if (!looksLikeHtml(urlLink.getFile())) {
            System.out.println(" Non-HTML appearing link");
            return;
        }

        // check to see if this URL has already been
        // searched or is going to be searched
        if (seenUrls.containsKey(strLink)) {
            System.out.println(" Seen before");
            return;
        }

        // test to make sure it is robot-safe!
        if (robotSafe(urlLink)) {
            frontier.addElement(strLink);
            listFrontier.add(strLink);
            System.out.println(" Adding to frontier (" + frontier.size() + " URLs)");
        }
        seenUrls.put(strLink, "");
    }

    /**
     * Tells whether a URL's file part (path plus query) looks like an HTML page.
     *
     * @param filename the value of {@link URL#getFile()}
     * @return {@code true} for an empty path, a path ending in ".html", ".htm" or "/",
     *         or anything containing a query string
     */
    static boolean looksLikeHtml(String filename) {
        return filename.length() == 0 // bare host such as http://example.com
                || filename.endsWith(".html")
                || filename.endsWith(".htm")
                || filename.endsWith("/")
                || filename.contains("?");
    }

    /**
     * Extracts the href values of all anchor tags in a page, in document order.
     *
     * <p>Tag and attribute names are matched case-insensitively. Values may be
     * double-quoted, single-quoted or unquoted. A fragment ("#...") is removed, and
     * links that are empty after that (for example href="#top") are left out.
     *
     * @param content the page source; may be {@code null}
     * @return the raw, unresolved link targets; never {@code null}
     */
    static List<String> extractLinkTargets(String content) {
        List<String> targets = new ArrayList<String>();
        if (content == null) {
            return targets;
        }

        int length = content.length();
        int index = 0;
        while ((index = indexOfIgnoreCase(content, "<a", index, length)) != -1) {
            index += 2;
            // "<a" must be followed by whitespace; otherwise it is <abbr>, <area>, ...
            if (index >= length || !Character.isWhitespace(content.charAt(index))) {
                continue;
            }

            int tagEnd = content.indexOf('>', index);
            if (tagEnd == -1) {
                tagEnd = length;
            }

            String target = hrefValue(content, index, tagEnd);
            if (target != null && target.length() > 0) {
                targets.add(target);
            }
            index = tagEnd;
        }
        return targets;
    }

    /**
     * Returns the value of the href attribute located in {@code content[from, tagEnd)}
     * with any fragment removed and surrounding whitespace trimmed, or {@code null} if
     * the range holds no href attribute. {@code content.charAt(from)} must be
     * whitespace.
     */
    private static String hrefValue(String content, int from, int tagEnd) {
        int search = from;
        while (true) {
            int href = indexOfIgnoreCase(content, "href", search, tagEnd);
            if (href == -1) {
                return null;
            }
            search = href + 4;

            // must be an attribute of its own, not the tail of e.g. data-href
            if (!Character.isWhitespace(content.charAt(href - 1))) {
                continue;
            }
            int pos = skipWhitespace(content, search, tagEnd);
            if (pos >= tagEnd || content.charAt(pos) != '=') {
                continue;
            }
            pos = skipWhitespace(content, pos + 1, tagEnd);

            int start = pos;
            int end;
            if (pos < tagEnd && (content.charAt(pos) == '"' || content.charAt(pos) == '\'')) {
                // quoted value: runs to the matching quote, or to the tag end if unclosed
                start = pos + 1;
                end = content.indexOf(content.charAt(pos), start);
                if (end == -1 || end > tagEnd) {
                    end = tagEnd;
                }
            } else {
                // unquoted value: runs to the next whitespace or the tag end
                end = pos;
                while (end < tagEnd && !Character.isWhitespace(content.charAt(end))) {
                    end++;
                }
            }

            String value = content.substring(start, end);
            int fragment = value.indexOf('#');
            if (fragment != -1) {
                value = value.substring(0, fragment);
            }
            return value.trim();
        }
    }

    /** Returns the first index in {@code [from, to)} that is not whitespace, or {@code to}. */
    private static int skipWhitespace(String text, int from, int to) {
        int pos = from;
        while (pos < to && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /**
     * Case-insensitive search for {@code needle} that starts at or after {@code from}
     * and ends at or before {@code to}; returns -1 if there is no such occurrence.
     */
    private static int indexOfIgnoreCase(String text, String needle, int from, int to) {
        int last = Math.min(to, text.length()) - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Picks the charset for decoding a page from its Content-Type header value.
     *
     * @param contentType the header value, e.g. "text/html; charset=ISO-8859-1"; may
     *                    be {@code null}
     * @return the charset named by the "charset" parameter if this JVM supports it,
     *         otherwise "UTF-8"
     */
    static String charsetOf(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        final String key = "charset=";
        StringTokenizer params = new StringTokenizer(contentType, ";");
        while (params.hasMoreTokens()) {
            String param = params.nextToken().trim();
            if (!param.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = param.substring(key.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                if (name.length() > 0 && Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException e) {
                // syntactically illegal charset name: fall back to the default
            }
        }
        return DEFAULT_CHARSET;
    }

    private void setStatus(String status) {
        labelStatus.setText(status);
    }

    /**
     * Reads the response body of an open connection and decodes it as text.
     *
     * <p>All bytes are collected first and decoded in one step, so a multi-byte
     * character that straddles two reads is decoded correctly. If reading fails part
     * way through, the text received up to that point is returned.
     *
     * @param connection  a connection whose response headers have been received
     * @param contentType the Content-Type header value, used to choose the charset
     * @return the page text; empty if nothing could be read
     */
    private String getPageContent(URLConnection connection, String contentType) {
        System.out.println("Pulling out page content...");

        ByteArrayOutputStream body = new ByteArrayOutputStream();
        InputStream urlStream = null;
        try {
            urlStream = connection.getInputStream();
            byte[] chunk = new byte[4096];
            int numRead;
            while ((numRead = urlStream.read(chunk)) != -1) {
                body.write(chunk, 0, numRead);
            }
        } catch (IOException e) {
            System.err.println(e);
            setStatus("ERROR: couldn't open URL " + connection.getURL());
        } finally {
            if (urlStream != null) {
                try {
                    urlStream.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }

        try {
            return body.toString(charsetOf(contentType));
        } catch (IOException e) {
            // not expected: charsetOf only returns charsets this JVM supports
            return body.toString();
        }
    }

    /** Dispatches the Search, Stop and Clear buttons. */
    public void actionPerformed(ActionEvent event) {
        String command = event.getActionCommand();

        if (SEARCH.equals(command)) {
            // a Thread can be started only once, so a click while a search is
            // already running is ignored
            if (searchThread != null) {
                return;
            }
            setStatus("searching...");

            // launch a thread to do the search; start it through a local reference
            // because stop() may clear the field at any time
            Thread worker = new Thread(this);
            searchThread = worker;
            worker.start(); // Causes run() to execute
        } else if (STOP.equals(command)) {
            stop();
        } else if (CLEAR.equals(command)) {
            clearAll();
        }
    }

    /** Empties the crawl data structures and both on-screen lists. */
    private void clearAll() {
        frontier.removeAllElements();
        seenUrls.clear();
        visitedUrls.removeAllElements();
        listFrontier.removeAll();
        listVisited.removeAll();
    }

    /** Runs the crawler in a stand-alone window. */
    public static void main(String[] argv) {
        final Frame f = new Frame("Web Crawler");
        final WebCrawler applet = new WebCrawler();
        f.add(applet, BorderLayout.CENTER);

        // a Frame ignores its close button unless a listener handles it
        f.addWindowListener(new WindowAdapter() {
            @Override
            public void windowClosing(WindowEvent e) {
                applet.stop();
                applet.destroy();
                f.dispose();
                System.exit(0);
            }
        });

        applet.init();
        applet.start();
        f.pack();
        f.setVisible(true);
    }
}