[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
re: http parser
Jason asked that I forward to the group:
===================================================================================
Mike - Please forward on to the ajug list so everyone
can benefit (I
can't do it from work).
Thanks.. Jason
My bad. My memory is failing me. I haven't used
HttpClient at all. It
was actually NekoHTML I used
http://www.garshol.priv.no/download/xmltools/prod/NekoHTML.html.
You can
also use HttpUnit http://httpunit.sourceforge.net (as
Bill mentioned)
even though you don't necessarily plan to use it for
unit testing, it
turns out to have a reasonable API which may give you
what you want - in
fact, HttpUnit looks like it uses NekoHTML under the
covers anyway
http://httpunit.sourceforge.net/doc/manual/installing.html#dependencies.
Here's how I used NekoHTML:
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
public class TestHTMLDOM {
public static void main(String[] argv) throws
Exception {
DOMParser parser = new DOMParser();
for (int i = 0; i < argv.length; i++) {
parser.parse(argv[i]);
Node firstFlightNumberNode =
findFirstFlightNumberNode(parser.getDocument());
// Once we have the first flight number
node, we can easily
navigate to
// the nodes we need
Collection collectionOfFlightAvailability
=
scrapeFlightAvailability(firstFlightNumberNode);
Iterator iter =
collectionOfFlightAvailability.iterator();
while (iter.hasNext()) {
FlightAvailability flightAvailability
=
(FlightAvailability)iter.next();
System.out.println(flightAvailability);
}
}
}
/**
* @param firstFlightNumberNode
*/
private static Collection
scrapeFlightAvailability(Node
flightNumberCommentRoot) {
//print(flightNumberCommentRoot, "");
//print(node.getNextSibling(), "");
Node departureAirportCommentRoot =
flightNumberCommentRoot.getNextSibling().getNextSibling().getNextSibling
().getNextSibling().getNextSibling();
Node arrivalAirportCommentRoot =
departureAirportCommentRoot.getNextSibling().getNextSibling().getNextSib
ling();
print (flightNumberCommentRoot, "");
print (departureAirportCommentRoot, "");
print (arrivalAirportCommentRoot, "");
Collection returnCollection = new ArrayList();
FlightAvailability flightAvailability = new
FlightAvailability();
flightAvailability.setBusinessAvailability("10/20");
flightAvailability.setFirstAvailability("15/20");
flightAvailability.setCoachAvailability("20/25");
flightAvailability.setOrigin(clean(departureAirportCommentRoot.getNextSi
bling().getNextSibling().getFirstChild().getNodeValue()));
flightAvailability.setDestination(clean(arrivalAirportCommentRoot.getNex
tSibling().getNextSibling().getFirstChild().getNodeValue()));
flightAvailability.setFlightNumber(clean(flightNumberCommentRoot.getNext
Sibling().getNextSibling().getFirstChild().getNextSibling().getFirstChil
d().getNodeValue()));
returnCollection.add(flightAvailability);
return returnCollection;
}
public static Node findFirstFlightNumberNode(Node
node) {
if (node.getNodeValue() != null) {
if (node.getNodeValue().indexOf("Flight
number") != -1) {
return node;
}
else {
return null;
}
}
else {
Node child = node.getFirstChild();
Node found = null;
while (child != null && found == null) {
found =
findFirstFlightNumberNode(child);
child = child.getNextSibling();
}
return found;
}
/*if (node.getNodeValue() != null) {
if (node.getNodeValue().indexOf("First class
availability") != -1) {
firstClassAvail =
clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu
e());
} else if (
node.getNodeValue().indexOf("Business
class availability") != -1) {
businessClassAvail =
clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu
e());
} else if (
node.getNodeValue().indexOf("Coach class
availability") != -1) {
coachClassAvail =
clean(node.getNextSibling().getNextSibling().getFirstChild().getNodeValu
e());
}
}
else {
Node child = node.getFirstChild();
while (child != null) {
find(child);
child = child.getNextSibling();
}
} */
}
private static String clean(String string) {
// Remove the newline, trailing and leading
spaces and
// spaces in the middle
StringBuffer sbuf = new StringBuffer
(string.trim());
int i = sbuf.indexOf("\n");
if (i != -1) {
int j = sbuf.indexOf("(");
if (j != -1)
sbuf.delete(i, j-1);
}
return sbuf.toString();
}
static void print(Node node, String indent) {
System.out.println(node.getNodeName()+"
"+node.getNodeValue());
Node child = node.getFirstChild();
while (child != null) {
print(child, indent+" ");
child = child.getNextSibling();
}
}
static private String firstClassAvail,
businessClassAvail,
coachClassAvail;
}
Here's how I used HttpUnit:
package com.delta.dcom.webdoctor.fpes;
import
com.delta.dcom.webdoctor.common.WebdoctorProperties;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebRequest;
import com.meterware.httpunit.WebResponse;
import com.meterware.httpunit.GetMethodWebRequest;
import com.meterware.httpunit.WebTable;
/**
* Tests the Flifo web application over HTTP. As
illustrated in the
following
* diagram, to the flifo web application the request
looks like any
other that
* comes from a regular browser. The response returned
from Flifo is
checked
* to see if it matches what was expected
(web-scraping). HttpUnit is
the
* technology that makes this all possible. For more
information on
HttpUnit,
* go to http://httpunit.sourceforge.net
* <p>
* <img src="doc-files/flifotest.jpg"/>
* <p>
* @author Jason Chambers
*/
public class FlifoTest extends TestCase {
/**
* Test Flifo functionality
*/
public void testFlifoFunctionality() throws
Exception {
loadProperties();
// create the conversation object which will
maintain state for
us
WebConversation wc = new WebConversation();
// Invoke the Flifo web application over HTTP
WebRequest request =
new GetMethodWebRequest(requestUrl);
request.setParameter("request", "main");
request.setParameter("flight_date",
requestFlightDate);
request.setParameter("flight_number",
requestFlightNumber);
WebResponse response =
wc.getResponse(request);
WebTable table[] = response.getTables();
WebTable flightInfoTable = table[1];
String
flifoString=flightInfoTable.getCellAsText(0,2);
if (flifoString.indexOf(expectArrivalCity) ==
-1) {
fail ("Could not find the arrival city
"+expectArrivalCity+"
in the response");
}
}
/**
* Loads the properties necessary for this test
*/
private void loadProperties() {
WebdoctorProperties props =
WebdoctorProperties.getInstance();
requestUrl =
props.getProperty("fpes.flifo.request.url");
requestFlightDate =
props.getProperty("fpes.flifo.request.flight.date");
requestFlightNumber =
props.getProperty("fpes.flifo.request.flight.number");
expectArrivalCity =
props.getProperty("fpes.flifo.expect.arrival.city");
}
private String requestUrl;
private String requestFlightDate;
private String requestFlightNumber;
private String expectArrivalCity;
/**
* Returns a TestSuite containing all Flifo tests
*/
public static Test suite() {
return new TestSuite(FlifoTest.class);
}
/**
* Auto-launches the test runner for this test
suite
*/
public static void main(String args[]) {
junit.textui.TestRunner.run(suite());
}
}
-----Original Message-----
From: Mike Barnes [mailto:mdb3624@yahoo.com]
Sent: Wednesday, May 12, 2004 9:05 PM
To: Jason Chambers
Subject: HTTPClient
Jason,
Thanks for the suggestion about using Jakarta
HTTPClient. I can not
figure out from the site how you convert the HTML into
an XML Document.
I think that I need XML so that I can parse out the
table tags easily.
The contents of the table are where I will find the
data that I am
looking for.
Thanks
Mike Barnes
__________________________________
Do you Yahoo!?
Yahoo! Movies - Buy advance tickets for 'Shrek 2'
http://movies.yahoo.com/showtimes/movie?mid=1808405861
__________________________________
Do you Yahoo!?
SBC Yahoo! - Internet access at a great low price.
http://promo.yahoo.com/sbc/