Friday, September 05, 2008

HTTP bot crawler


import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

/**
 * Downloads the resource at {@code url} and writes its bytes, unmodified,
 * to the local file {@code file}.
 *
 * <p>Both the connection's input stream and the output file are closed even
 * when the transfer fails part-way, so no descriptors are leaked.
 *
 * @param url  absolute URL to fetch
 * @param file path of the file to create or overwrite with the response body
 * @throws Exception if the URL is malformed, the connection cannot be opened,
 *                   or reading/writing fails
 */
private void runCrawl(String url, String file) throws Exception {
    URL crawlURL = new URL(url);
    URLConnection yc = crawlURL.openConnection();

    // If the request must go through an authenticating proxy, set a
    // "Proxy-Authorization: Basic <base64(user:password)>" request property
    // on yc here, before the stream is opened. Never hard-code credentials.

    // Open the input first so that a failed connection does not leave an
    // empty output file behind.
    InputStream in = yc.getInputStream();
    try {
        FileOutputStream fos = new FileOutputStream(file);
        try {
            // Copy in chunks instead of one byte per read()/write() call.
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                fos.write(buffer, 0, read);
            }
            fos.flush();
        } finally {
            fos.close();
        }
    } finally {
        in.close();
    }
}

/**
 * Reads a text file and returns its content as a single string.
 *
 * <p>Lines are concatenated <em>without</em> their line terminators, so the
 * result is one long line. The file is decoded with the platform default
 * charset (that is what {@link FileReader} uses).
 *
 * @param file path of the file to read
 * @return the file content with all line breaks removed; empty string for an
 *         empty file
 * @throws Exception if the file cannot be opened or read
 */
private String readFile(String file) throws Exception {
    // StringBuilder avoids re-copying the whole text on every appended line.
    StringBuilder html = new StringBuilder();
    BufferedReader in = new BufferedReader(new FileReader(file));
    try {
        String line;
        while ((line = in.readLine()) != null) {
            html.append(line);
        }
    } finally {
        // Close the reader even when readLine() fails.
        in.close();
    }
    return html.toString();
}

/**
 * Downloads the page at {@code SC_MEGA_URL} into {@code SC_MEGA_FILE}, parses
 * it, and collects the text of every {@code <span>} whose id starts with
 * {@code DrawResults2_lblMM} into a {@link MegaVO}.
 *
 * <p>The part of the id after that prefix selects the target field:
 * <ul>
 *   <li>{@code DrawNum}  - draw number</li>
 *   <li>{@code DrawDate} - draw date</li>
 *   <li>{@code Mega}     - the "mega" value</li>
 *   <li>any other id ending in {@code 1}..{@code 5} - value slot 1..5</li>
 * </ul>
 * Fields with no matching span stay as empty strings. Spans without a child
 * node, and ids that match none of the above, are skipped rather than
 * aborting the whole task.
 *
 * @return a value object built from the extracted strings
 * @throws Exception if the download, the file read, or the HTML parse fails
 */
public MegaVO doMegaTask() throws Exception {
    runCrawl(SC_MEGA_URL, SC_MEGA_FILE);
    String html = readFile(SC_MEGA_FILE);
    Parser parser = new Parser();
    parser.setInputHTML(html);

    NodeFilter nf = new TagNameFilter("span");
    NodeList spans = parser.parse(nf);

    // Tag text as reported by the parser: tag name plus attributes, no brackets.
    final String prefix = "span id=\"DrawResults2_lblMM";

    String num = "", date = "", d1 = "", d2 = "", d3 = "", d4 = "", d5 = "", mm = "";
    for (int i = 0; i < spans.size(); i++) {
        Node span = spans.elementAt(i);
        String text = span.getText();
        if (text == null || !text.startsWith(prefix)) {
            continue;
        }

        // The id ends at its closing quote, which need not be the last
        // character of the tag text when further attributes follow.
        int idEnd = text.indexOf('"', prefix.length());
        if (idEnd <= prefix.length()) {
            // No closing quote, or nothing after the prefix: not a usable id.
            continue;
        }
        String id = text.substring(prefix.length(), idEnd);

        Node child = span.getFirstChild();
        if (child == null) {
            // An empty span has no value to read.
            logger.info("Skipping span with no child node: " + id);
            continue;
        }
        String value = child.getText();

        if (id.equals("DrawNum")) {
            logger.info(value);
            num = value;
        } else if (id.equals("DrawDate")) {
            logger.info(value);
            date = value;
        } else if (id.equals("Mega")) {
            logger.info(value);
            mm = value;
        } else {
            // Remaining ids are expected to end in the slot number 1..5;
            // anything else is logged below but otherwise ignored.
            switch (id.charAt(id.length() - 1)) {
                case '1':
                    d1 = value;
                    break;
                case '2':
                    d2 = value;
                    break;
                case '3':
                    d3 = value;
                    break;
                case '4':
                    d4 = value;
                    break;
                case '5':
                    d5 = value;
                    break;
                default:
                    break;
            }
            logger.info(id);
            logger.info(value);
        }
    }

    MegaVO vo = new MegaVO(num, date, d1, d2, d3, d4, d5, mm);
    logger.info(vo.toString());
    return vo;
}

No comments: