[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

re: http parser



Jason asked that I forward to the group:


===================================================================================

Mike - Please forward this on to the ajug list so everyone can benefit
(I can't do it from work).



Thanks.. Jason



My bad. My memory is failing me. I haven't used HttpClient at all. It
was actually NekoHTML that I used:

http://www.garshol.priv.no/download/xmltools/prod/NekoHTML.html

You can also use HttpUnit http://httpunit.sourceforge.net (as Bill
mentioned). Even though you don't necessarily plan to use it for unit
testing, it turns out to have a reasonable API which may give you what
you want - in fact, HttpUnit looks like it uses NekoHTML under the
covers anyway:

http://httpunit.sourceforge.net/doc/manual/installing.html#dependencies



Here's how I used NekoHTML:



import java.util.ArrayList;

import java.util.Collection;

import java.util.Iterator;



import org.cyberneko.html.parsers.DOMParser;

import org.w3c.dom.Document;

import org.w3c.dom.Node;



public class TestHTMLDOM {

	public static void main(String[] argv) throws
Exception {

		DOMParser parser = new DOMParser();

		for (int i = 0; i < argv.length; i++) {

			parser.parse(argv[i]);

			Node firstFlightNumberNode =

findFirstFlightNumberNode(parser.getDocument());

            

            // Once we have the first flight number
node, we can easily

navigate to 

            // the nodes we need

            

            Collection collectionOfFlightAvailability
=

scrapeFlightAvailability(firstFlightNumberNode);

            Iterator iter =
collectionOfFlightAvailability.iterator();

            while (iter.hasNext()) {

                FlightAvailability flightAvailability
=

(FlightAvailability)iter.next();

               
System.out.println(flightAvailability);

            }

		}

	}

    

	/**

	 * @param firstFlightNumberNode

	 */

	private static Collection
scrapeFlightAvailability(Node

flightNumberCommentRoot) {

        //print(flightNumberCommentRoot, "");

        //print(node.getNextSibling(), "");

        Node departureAirportCommentRoot =

flightNumberCommentRoot.getNextSibling().getNextSibling().getNextSibling

().getNextSibling().getNextSibling();       

        Node arrivalAirportCommentRoot =

departureAirportCommentRoot.getNextSibling().getNextSibling().getNextSib

ling();

        print (flightNumberCommentRoot, "");

        print (departureAirportCommentRoot, "");

        print (arrivalAirportCommentRoot, "");

		Collection returnCollection = new ArrayList();

        FlightAvailability flightAvailability = new

FlightAvailability();

       
flightAvailability.setBusinessAvailability("10/20");

       
flightAvailability.setFirstAvailability("15/20");

       
flightAvailability.setCoachAvailability("20/25");

 

flightAvailability.setOrigin(clean(departureAirportCommentRoot.getNextSi

bling().getNextSibling().getFirstChild().getNodeValue()));

 

flightAvailability.setDestination(clean(arrivalAirportCommentRoot.getNex

tSibling().getNextSibling().getFirstChild().getNodeValue()));

 

flightAvailability.setFlightNumber(clean(flightNumberCommentRoot.getNext

Sibling().getNextSibling().getFirstChild().getNextSibling().getFirstChil

d().getNodeValue()));

        

        returnCollection.add(flightAvailability);

        return returnCollection;

		

	}



	public static Node findFirstFlightNumberNode(Node
node) {

        if (node.getNodeValue() != null) {

            if (node.getNodeValue().indexOf("Flight
number") != -1) {

                return node;

            }

            else {

                return null;

            }

        }

        else {

            Node child = node.getFirstChild();

            Node found = null;

            while (child != null && found == null) {

                found =
findFirstFlightNumberNode(child);

                child = child.getNextSibling();

            }

            return found; 

        }

		/*if (node.getNodeValue() != null) {

			if (node.getNodeValue().indexOf("First class

availability") != -1) {

				firstClassAvail =

clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu

e());

			} else if (

				node.getNodeValue().indexOf("Business

class availability") != -1) {

				businessClassAvail =

clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu

e());

			} else if (

				node.getNodeValue().indexOf("Coach class

availability") != -1) {

				coachClassAvail =

clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu

e());

			}

        }

        else {

		    Node child = node.getFirstChild();

		    while (child != null) {

			    find(child);

			    child = child.getNextSibling();

		    }

	    } */       

    }

    

	private static String clean(String string) {

        

        // Remove the newline, trailing and leading
spaces and 

        // spaces in the middle

        StringBuffer sbuf = new StringBuffer
(string.trim());

        int i = sbuf.indexOf("\n");

        if (i != -1) {

            int j = sbuf.indexOf("(");

            if (j != -1)

                sbuf.delete(i, j-1);

        }

        return sbuf.toString();

	}

    

    static void print(Node node, String indent) {

        System.out.println(node.getNodeName()+"
"+node.getNodeValue());

        Node child = node.getFirstChild();

        while (child != null) {

            print(child, indent+"   ");

            child = child.getNextSibling();

        }

    }

	static private String firstClassAvail,
businessClassAvail,

coachClassAvail;

}



Here's how I used HttpUnit:



package com.delta.dcom.webdoctor.fpes;

import
com.delta.dcom.webdoctor.common.WebdoctorProperties;

import junit.framework.TestCase;

import junit.framework.TestSuite;

import junit.framework.Test;

import com.meterware.httpunit.WebConversation;

import com.meterware.httpunit.WebRequest;

import com.meterware.httpunit.WebResponse;

import com.meterware.httpunit.GetMethodWebRequest;

import com.meterware.httpunit.WebTable;



/**

 * Tests the Flifo web application over HTTP. As
illustrated in the

following 

 * diagram, to the flifo web application the request
looks like any

other that 

 * comes from a regular browser. The response returned
from Flifo is

checked

 * to see if it matches what was  expected
(web-scraping). HttpUnit is

the 

 * technology that makes this all possible. For more
information on

HttpUnit, 

 * go to http://httpunit.sourceforge.net

 * <p>        

 * <img src="doc-files/flifotest.jpg"/>

 * <p>

 * @author    Jason Chambers

 */

public class FlifoTest extends TestCase {



    /**

     * Test Flifo functionality

     */

    public void testFlifoFunctionality() throws
Exception {

        

        loadProperties();



        // create the conversation object which will
maintain state for

us

        WebConversation wc = new WebConversation();

        

        // Invoke the Flifo web application over HTTP

        WebRequest request =

            new GetMethodWebRequest(requestUrl);

        request.setParameter("request", "main");

        request.setParameter("flight_date",
requestFlightDate);

        request.setParameter("flight_number",
requestFlightNumber);

        WebResponse response =
wc.getResponse(request);

        WebTable table[] = response.getTables();

        WebTable flightInfoTable = table[1];

        String
flifoString=flightInfoTable.getCellAsText(0,2);

        if (flifoString.indexOf(expectArrivalCity) ==
-1) {

            fail ("Could not find the arrival city
"+expectArrivalCity+"

in the response");

        }

    }



    /**

     * Loads the properties necessary for this test

     */

    private void loadProperties() {

        WebdoctorProperties props =
WebdoctorProperties.getInstance();

        requestUrl = 

           
props.getProperty("fpes.flifo.request.url");

        requestFlightDate = 

           
props.getProperty("fpes.flifo.request.flight.date");

        requestFlightNumber = 

           
props.getProperty("fpes.flifo.request.flight.number");

        expectArrivalCity = 

           
props.getProperty("fpes.flifo.expect.arrival.city");

    }

    

    private String requestUrl;

    private String requestFlightDate;

    private String requestFlightNumber;

    private String expectArrivalCity;



    /**

     * Returns a TestSuite containing all Flifo tests

     */

    public static Test suite() {

        return new TestSuite(FlifoTest.class);

    }

    

    /**

     * Auto-launches the test runner for this test
suite

     */

    public static void main(String args[]) {

        junit.textui.TestRunner.run(suite());

    }

}

-----Original Message-----

From: Mike Barnes [mailto:mdb3624@yahoo.com] 

Sent: Wednesday, May 12, 2004 9:05 PM

To: Jason Chambers

Subject: HTTPClient





Jason,



Thanks for the suggestion about using Jakarta HTTPClient.  I cannot
figure out from the site how you convert the HTML into an XML Document.



I think that I need XML so that I can parse out the

table tags easily. 

The contents of the table are where I will find the

data that I am 

looking for.



Thanks



Mike Barnes





	

		

__________________________________

Do you Yahoo!?

Yahoo! Movies - Buy advance tickets for 'Shrek 2'

http://movies.yahoo.com/showtimes/movie?mid=1808405861








	
		
__________________________________
Do you Yahoo!?
SBC Yahoo! - Internet access at a great low price.
http://promo.yahoo.com/sbc/