网页主动探测工具-使用Reactor模式
接前文
http://blog.itpub.net/29254281/viewspace-1344706/
http://blog.itpub.net/29254281/viewspace-1347985/
http://blog.itpub.net/29254281/viewspace-2134876/
之前的代码被大神怼了..被怒批 杂乱无章,误人子弟
我估计主要是因为面向对象的程度不够.过程化太明显了
在网上找了一个Reactor模式的例子,又改了改自己的程序.
因为Oracle太笨重了,这回干脆换了MySQL好了
改写之后的程序,在我的电脑上,使用 2线程,最大500连接的配置,性能最好。
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.SocketAddress;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.nio.channels.UnresolvedAddressException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Single-threaded event loop: starts queued fetch tasks (up to a global connection cap) and
 * dispatches selector events to the {@link Runnable} attached to each key.
 *
 * <p>The task queue and both counters are static, i.e. shared by every reactor instance.
 */
class Reactor implements Runnable {
    /** Upper bound on concurrently open connections across all reactors. */
    private static final int MAX_CLIENT = 500;
    /**
     * Select timeout. Tasks arrive through the shared queue, not through the selector, so an
     * idle reactor must wake up periodically to poll the queue instead of blocking forever.
     */
    private static final long SELECT_TIMEOUT_MS = 100L;

    private static final AtomicInteger COUNT = new AtomicInteger();
    private static final AtomicInteger TASKCOUNT = new AtomicInteger();
    private static final BlockingQueue<Task> QUEUE = new LinkedBlockingQueue<Task>();

    public final Selector selector;

    public Reactor() throws IOException {
        selector = Selector.open();
    }

    /** @return number of responses fully received so far */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    /** @return number of tasks waiting to be started */
    public static int getQueueSize() {
        return QUEUE.size();
    }

    /** Reserves one connection slot. */
    public int startTask() {
        return TASKCOUNT.incrementAndGet();
    }

    /** Releases one connection slot. */
    public int finishTask() {
        return TASKCOUNT.decrementAndGet();
    }

    /** Counts one completed response. */
    public int incrementAndGet() {
        return COUNT.incrementAndGet();
    }

    /** Enqueues a task on the shared queue. */
    public void addTask(Task task) {
        try {
            QUEUE.put(task);
        } catch (InterruptedException e) {
            // Preserve the interrupt so the run() loop can observe it and stop.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    @Override
    public void run() {
        try {
            while (!Thread.interrupted()) {
                startQueuedTasks();
                selector.select(SELECT_TIMEOUT_MS);
                // Every ready key carries the handler for its current phase
                // (connect/write or read) as its attachment.
                Set<SelectionKey> selectedKeys = selector.selectedKeys();
                for (SelectionKey key : selectedKeys) {
                    dispatch(key);
                }
                selectedKeys.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens connections for queued tasks while slots are available. */
    private void startQueuedTasks() {
        while (TASKCOUNT.get() < MAX_CLIENT) {
            Task task = QUEUE.poll();
            if (task == null) {
                return;
            }
            new Connector(this, task).run();
        }
    }

    /**
     * Runs the handler attached to the key.
     *
     * @param key a selected key
     */
    void dispatch(SelectionKey key) {
        Runnable handler = (Runnable) key.attachment();
        if (handler == null) {
            return;
        }
        try {
            handler.run();
        } catch (RuntimeException e) {
            // One misbehaving connection must not terminate the whole event loop.
            e.printStackTrace();
        }
    }
}

/** Opens a non-blocking connection for one task and registers it with the reactor. */
class Connector implements Runnable {
    private final Reactor reactor;
    private final Task task;

    public Connector(Reactor reactor, Task task) {
        this.reactor = reactor;
        this.task = task;
    }

    @Override
    public void run() {
        reactor.startTask();
        task.setStarttime(System.currentTimeMillis());

        BaseHandler base = new BaseHandler();
        base.setTask(task);
        base.setSelector(reactor.selector);
        base.setReactor(reactor);
        try {
            SocketChannel socketChannel = SocketChannel.open();
            base.setSocketChannel(socketChannel);
            socketChannel.configureBlocking(false);
            SocketAddress addr = new InetSocketAddress(task.getHost(), task.getPort());
            // connect() may complete immediately; then OP_CONNECT would never fire,
            // so go straight to the write phase.
            int ops = socketChannel.connect(addr) ? SelectionKey.OP_WRITE : SelectionKey.OP_CONNECT;
            socketChannel.register(reactor.selector, ops, new SocketWriteHandler(base));
        } catch (IOException e) {
            base.release(e);
        } catch (UnresolvedAddressException e) {
            // Unchecked: thrown by connect() when the host name cannot be resolved.
            base.release(e);
        }
    }
}

/** Per-connection state shared by the write and read handlers. */
class BaseHandler {
    private Selector selector;
    private SocketChannel socketChannel;
    private Task task;
    private ByteBuffer byteBuffer = ByteBuffer.allocate(2400);
    private Reactor reactor;
    private boolean released;

    public Reactor getReactor() {
        return reactor;
    }

    public void setReactor(Reactor reactor) {
        this.reactor = reactor;
    }

    public Selector getSelector() {
        return selector;
    }

    public void setSelector(Selector selector) {
        this.selector = selector;
    }

    public SocketChannel getSocketChannel() {
        return socketChannel;
    }

    public void setSocketChannel(SocketChannel socketChannel) {
        this.socketChannel = socketChannel;
    }

    public Task getTask() {
        return task;
    }

    public void setTask(Task task) {
        this.task = task;
    }

    public ByteBuffer getByteBuffer() {
        return byteBuffer;
    }

    /**
     * Closes the channel and gives the connection slot back to the reactor. Only the first
     * call has an effect, so success and failure paths may both call it.
     *
     * @param cause the failure that ended the connection, or {@code null} on normal completion
     */
    void release(Exception cause) {
        if (cause != null) {
            cause.printStackTrace();
        }
        if (released) {
            return;
        }
        released = true;
        if (socketChannel != null) {
            try {
                socketChannel.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (reactor != null) {
            reactor.finishTask();
        }
    }
}

/** Completes the connection and sends the HTTP/1.0 GET request. */
class SocketWriteHandler implements Runnable {
    private static final Charset UTF8 = Charset.forName("UTF-8");

    BaseHandler baseHandler;
    /** The encoded request; sized to fit, so long paths cannot overflow it. */
    private final ByteBuffer request;

    public SocketWriteHandler(BaseHandler baseHandler) {
        this.baseHandler = baseHandler;
        Task task = baseHandler.getTask();
        String head = "GET " + task.getCurrentPath() + " HTTP/1.0\r\n"
                + "Host: " + task.getHost() + "\r\n"
                + "Accept: */*\r\n"
                + "\r\n";
        request = ByteBuffer.wrap(head.getBytes(UTF8));
    }

    @Override
    public void run() {
        SocketChannel channel = baseHandler.getSocketChannel();
        try {
            if (!channel.finishConnect()) {
                // Not connected yet: stay registered and wait for the next event
                // instead of spinning on the reactor thread.
                return;
            }
            channel.write(request);
            if (request.hasRemaining()) {
                // Partial write: keep position/limit as they are and resume when writable.
                channel.register(baseHandler.getSelector(), SelectionKey.OP_WRITE, this);
            } else {
                baseHandler.getByteBuffer().clear();
                channel.register(baseHandler.getSelector(), SelectionKey.OP_READ,
                        new SocketReadHandler(baseHandler));
            }
        } catch (IOException e) {
            baseHandler.release(e);
        }
    }
}

/** Reads the response until EOF, decoding it as GBK, then hands the task to the parser. */
class SocketReadHandler implements Runnable {
    Charset charset = Charset.forName("utf8");
    Charset gbkcharset = Charset.forName("gbk");
    BaseHandler baseHandler;

    /** Stateful decoder so a multi-byte character split across two reads is decoded intact. */
    private final CharsetDecoder decoder = gbkcharset.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    private final CharBuffer chars = CharBuffer.allocate(2400);

    public SocketReadHandler(BaseHandler baseHandler) {
        this.baseHandler = baseHandler;
    }

    @Override
    public void run() {
        SocketChannel channel = baseHandler.getSocketChannel();
        ByteBuffer byteBuffer = baseHandler.getByteBuffer();
        Task task = baseHandler.getTask();
        try {
            int length;
            while ((length = channel.read(byteBuffer)) > 0) {
                byteBuffer.flip();
                decode(byteBuffer, false);
                // Keeps the bytes of an incomplete trailing character for the next read.
                byteBuffer.compact();
            }
            if (length == -1) {
                byteBuffer.flip();
                decode(byteBuffer, true);
                task.setEndtime(System.currentTimeMillis());
                baseHandler.getReactor().incrementAndGet();
                baseHandler.release(null);
                new ParseHandler(task, baseHandler.getReactor()).run();
            } else {
                channel.register(baseHandler.getSelector(), SelectionKey.OP_READ, this);
            }
        } catch (IOException e) {
            baseHandler.release(e);
        }
    }

    /** Decodes the readable bytes of {@code in} and appends the text to the task content. */
    private void decode(ByteBuffer in, boolean endOfInput) {
        StringBuilder content = baseHandler.getTask().getContent();
        CoderResult result;
        do {
            result = decoder.decode(in, chars, endOfInput);
            drainTo(content);
        } while (result.isOverflow());
        if (endOfInput) {
            do {
                result = decoder.flush(chars);
                drainTo(content);
            } while (result.isOverflow());
        }
    }

    private void drainTo(StringBuilder content) {
        chars.flip();
        content.append(chars);
        chars.clear();
    }
}

/** Entry point: starts the reactors, seeds the crawl and prints throughput once a second. */
public class Probe {
    public static void main(String[] args) throws IOException, InterruptedException {
        for (int i = 0; i < 2; i++) {
            Reactor reactor = new Reactor();
            if (i == 0) {
                // The queue is shared by all reactors; seeding it once avoids a duplicate fetch.
                reactor.addTask(new Task("news.163.com", 80, "/index.html"));
            }
            new Thread(reactor, "ReactorThread_" + i).start();
        }
        long start = System.currentTimeMillis();
        while (true) {
            Thread.sleep(1000);
            long end = System.currentTimeMillis();
            // Float division (the elapsed time used to be truncated to whole seconds);
            // the lower bound rules out a division by zero.
            float interval = Math.max((end - start) / 1000f, 0.001f);
            int connectTotal = Reactor.GETCOUNT();
            int persistenceTotal = PersistenceHandler.GETCOUNT();
            int connectps = Math.round(connectTotal / interval);
            int persistenceps = Math.round(persistenceTotal / interval);
            System.out.print("\r连接总数:" + connectTotal + "\t每秒连接:" + connectps
                    + "\t连接队列剩余:" + Reactor.getQueueSize()
                    + "\t持久化总数:" + persistenceTotal + "\t每秒持久化:" + persistenceps
                    + "\t持久化队列剩余:" + PersistenceHandler.getInstance().getSize());
        }
    }
}

/** One URL to fetch, plus the timing, status and body collected while fetching it. */
class Task {
    private String host;
    private int port;
    private String currentPath;
    private long starttime;
    private long endtime;
    private String type;
    private StringBuilder content = new StringBuilder(2400);
    private int state;
    private boolean isValid = true;

    public Task() {
    }

    public Task(String host, int port, String path) {
        init(host, port, path);
    }

    public void init(String host, int port, String path) {
        this.setCurrentPath(path);
        this.host = host;
        this.port = port;
    }

    public long getStarttime() {
        return starttime;
    }

    public void setStarttime(long starttime) {
        this.starttime = starttime;
    }

    public long getEndtime() {
        return endtime;
    }

    public void setEndtime(long endtime) {
        this.endtime = endtime;
    }

    public boolean isValid() {
        return isValid;
    }

    public void setValid(boolean isValid) {
        this.isValid = isValid;
    }

    public int getState() {
        return state;
    }

    public void setState(int state) {
        this.state = state;
    }

    public String getCurrentPath() {
        return currentPath;
    }

    /** Sets the path and derives {@link #getType()} from its file extension. */
    public void setCurrentPath(String currentPath) {
        this.currentPath = currentPath;
        this.type = extensionOf(currentPath);
    }

    /**
     * Returns the extension of the last path segment, ignoring any query string or fragment.
     * Returns {@code ""} when that segment has no dot (previously the whole path was returned,
     * and a dot located after {@code ?} raised StringIndexOutOfBoundsException).
     */
    private static String extensionOf(String path) {
        int end = path.length();
        int query = path.indexOf('?');
        if (query != -1) {
            end = query;
        }
        int fragment = path.indexOf('#');
        if (fragment != -1 && fragment < end) {
            end = fragment;
        }
        String bare = path.substring(0, end);
        int dot = bare.lastIndexOf('.');
        if (dot <= bare.lastIndexOf('/')) {
            return "";
        }
        return bare.substring(dot + 1);
    }

    public long getTaskTime() {
        return getEndtime() - getStarttime();
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getHost() {
        return host;
    }

    public int getPort() {
        return port;
    }

    public StringBuilder getContent() {
        return content;
    }

    public void setContent(StringBuilder content) {
        this.content = content;
    }
}

/**
 * Extracts the status code and the quoted {@code .htm} links from a finished task, enqueues
 * links not seen before, and passes the task on for persistence.
 */
class ParseHandler implements Runnable {
    /** host + path of every task ever enqueued by a parser; guarded by its own monitor. */
    private static final Set<String> SET = new HashSet<String>();
    /** A double-quoted string containing ".htm". */
    private static final Pattern LINK = Pattern.compile("\"[^\"]+\\.htm[^\"]*\"");

    PersistenceHandler persistencehandler = PersistenceHandler.getInstance();
    /** Hosts that absolute links may point to. */
    List<String> domainlist = new ArrayList<String>();
    Task task;
    private Reactor reactor;

    /** One link-resolution rule; calls {@code chain} when it does not apply. */
    private interface Filter {
        void doFilter(Task fatherTask, Task newTask, String path, Filter chain);
    }

    /** Tries the rules in order until one of them handles the link. */
    private class FilterChain implements Filter {
        private List<Filter> list = new ArrayList<Filter>();

        {
            addFilter(new TwoLevel());
            addFilter(new OneLevel());
            addFilter(new FullPath());
            addFilter(new Root());
            addFilter(new Default());
        }

        private void addFilter(Filter filter) {
            list.add(filter);
        }

        // Created after the initializer block above, so it sees all five filters.
        private Iterator<Filter> it = list.iterator();

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (it.hasNext()) {
                it.next().doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Links starting with "../../". */
    private class TwoLevel implements Filter {
        private static final String MARK = "../../";

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith(MARK)) {
                String prefix = getPrefix(fatherTask.getCurrentPath(), 3);
                newTask.init(fatherTask.getHost(), fatherTask.getPort(),
                        join(prefix, path.substring(MARK.length())));
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Links starting with "../". */
    private class OneLevel implements Filter {
        private static final String MARK = "../";

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith(MARK)) {
                String prefix = getPrefix(fatherTask.getCurrentPath(), 2);
                newTask.init(fatherTask.getHost(), fatherTask.getPort(),
                        join(prefix, path.substring(MARK.length())));
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Absolute http:// links; only whitelisted hosts are accepted. */
    private class FullPath implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (!path.startsWith("http://")) {
                chain.doFilter(fatherTask, newTask, path, chain);
                return;
            }
            for (String domain : domainlist) {
                String origin = "http://" + domain + "/";
                if (path.startsWith(origin)) {
                    // Strip only the leading origin; the same text later in the URL is kept.
                    newTask.init(domain, fatherTask.getPort(),
                            "/" + path.substring(origin.length()));
                    return;
                }
            }
            newTask.setValid(false);
        }
    }

    /** Host-relative links starting with "/". */
    private class Root implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith("/")) {
                newTask.init(fatherTask.getHost(), fatherTask.getPort(), path);
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Everything else: relative to the parent's directory; other schemes are rejected. */
    private class Default implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.contains(":")) {
                newTask.setValid(false);
                return;
            }
            String prefix = getPrefix(fatherTask.getCurrentPath(), 1);
            newTask.init(fatherTask.getHost(), fatherTask.getPort(), join(prefix, path));
        }
    }

    public ParseHandler(Task task, Reactor reactor) {
        this.task = task;
        this.reactor = reactor;
        // Whitelist of crawlable hosts.
        this.domainlist.add("news.163.com");
    }

    /**
     * Stores the status code from the status line ("HTTP/x.y NNN ...") in the task, or 0 when
     * the response does not start with a well-formed status line.
     */
    private void parseTaskState(Task task) {
        StringBuilder content = task.getContent();
        int state = 0;
        if (content.length() >= 12 && "HTTP/".equals(content.substring(0, 5))) {
            try {
                state = Integer.parseInt(content.substring(9, 12));
            } catch (NumberFormatException e) {
                state = 0;
            }
        }
        task.setState(state);
    }

    /**
     * Resolves {@code path} against the parent task and enqueues it unless it is invalid or
     * has been enqueued before.
     *
     * @param fatherTask the page the link was found on
     * @param path the link text without the surrounding quotes
     */
    private void createNewTask(Task fatherTask, String path) {
        Task newTask = new Task();
        FilterChain filterchain = new FilterChain();
        filterchain.doFilter(fatherTask, newTask, path, filterchain);
        if (!newTask.isValid()) {
            return;
        }
        String key = newTask.getHost() + newTask.getCurrentPath();
        synchronized (SET) {
            if (!SET.add(key)) {
                return;
            }
        }
        reactor.addTask(newTask);
    }

    /**
     * Removes up to {@code count} trailing "/segment" parts from {@code s}; returns "/" when
     * nothing is left. Stops early when no "/" remains instead of throwing.
     */
    private String getPrefix(String s, int count) {
        String prefix = s;
        while (count > 0) {
            int slash = prefix.lastIndexOf("/");
            if (slash < 0) {
                prefix = "";
                break;
            }
            prefix = prefix.substring(0, slash);
            count--;
        }
        return "".equals(prefix) ? "/" : prefix;
    }

    /** Joins a directory and a relative path with exactly one "/" between them. */
    private static String join(String dir, String rest) {
        return dir.endsWith("/") ? dir + rest : dir + "/" + rest;
    }

    @Override
    public void run() {
        try {
            parseTaskState(task);
            if (200 == task.getState()) {
                Matcher matcher = LINK.matcher(task.getContent());
                while (matcher.find()) {
                    String quoted = matcher.group();
                    boolean looksLikeCode = quoted.contains(" ") || quoted.contains("\t")
                            || quoted.contains("(") || quoted.contains(")");
                    if (!looksLikeCode) {
                        createNewTask(task, quoted.substring(1, quoted.length() - 1));
                    }
                }
            }
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
        // Record the fetch even when link extraction failed.
        persistencehandler.addTask(task);
    }
}

/** Singleton that writes finished tasks to MySQL in batches on its own thread. */
class PersistenceHandler implements Runnable {
    private static final int BATCH_SIZE = 500;

    private static class SingletonHandler {
        private static PersistenceHandler obj = new PersistenceHandler();
    }

    public static PersistenceHandler getInstance() {
        return SingletonHandler.obj;
    }

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** @return number of tasks added to a batch so far */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    private static final AtomicInteger COUNT = new AtomicInteger();

    private BlockingQueue<Task> persistencelist;
    private Connection conn;
    private PreparedStatement ps;
    /** Rows added to the current batch and not yet committed. */
    private int pending;

    public PersistenceHandler() {
        this.persistencelist = new LinkedBlockingQueue<Task>();
        new Thread(this, "PersistenceThread").start();
    }

    public void addTask(Task task) {
        try {
            this.persistencelist.put(task);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    public int getSize() {
        return persistencelist.size();
    }

    @Override
    public void run() {
        try {
            conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/mvbox", "xx", "xx");
            conn.setAutoCommit(false);
            ps = conn.prepareStatement(
                    "insert into probe(host,path,state,tasktime,type,length,createtime)"
                            + " values(?,?,?,?,?,?,?)");
        } catch (SQLException e) {
            // Without a statement nothing can be stored; stop here rather than fail with a
            // NullPointerException on the first task.
            e.printStackTrace();
            return;
        }
        while (!Thread.currentThread().isInterrupted()) {
            this.handler();
        }
    }

    /**
     * Adds the next task to the batch and commits when the batch is full or the queue has
     * been idle for a second, so a final partial batch is not left uncommitted.
     */
    private void handler() {
        try {
            Task task = persistencelist.poll(1, TimeUnit.SECONDS);
            if (task != null) {
                ps.setString(1, task.getHost());
                ps.setString(2, task.getCurrentPath());
                ps.setInt(3, task.getState());
                ps.setLong(4, task.getTaskTime());
                ps.setString(5, task.getType());
                ps.setInt(6, task.getContent().length());
                ps.setTimestamp(7, new Timestamp(task.getEndtime()));
                ps.addBatch();
                pending++;
                COUNT.incrementAndGet();
            }
            boolean idleWithRows = task == null && pending > 0;
            if (pending >= BATCH_SIZE || idleWithRows) {
                ps.executeBatch();
                conn.commit();
                pending = 0;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (SQLException e) {
            e.printStackTrace();
            discardBatch();
        }
    }

    /** Drops the failed batch so that later rows can still be stored. */
    private void discardBatch() {
        pending = 0;
        try {
            ps.clearBatch();
            conn.rollback();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
http://blog.itpub.net/29254281/viewspace-1344706/
http://blog.itpub.net/29254281/viewspace-1347985/
http://blog.itpub.net/29254281/viewspace-2134876/
之前的代码被大神怼了..被怒批 杂乱无章,误人子弟
我估计主要是因为面向对象的程度不够.过程化太明显了
在网上找了一个Reactor模式的例子,又改了改自己的程序.
因为Oracle太笨重了,这回干脆换了MySQL好了
改写之后的程序,在我的电脑上,使用 2线程,最大500连接的配置,性能最好。
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.SocketAddress;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.nio.channels.UnresolvedAddressException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Single-threaded event loop: starts queued fetch tasks (up to a global connection cap) and
 * dispatches selector events to the {@link Runnable} attached to each key.
 *
 * <p>The task queue and both counters are static, i.e. shared by every reactor instance.
 */
class Reactor implements Runnable {
    /** Upper bound on concurrently open connections across all reactors. */
    private static final int MAX_CLIENT = 500;
    /**
     * Select timeout. Tasks arrive through the shared queue, not through the selector, so an
     * idle reactor must wake up periodically to poll the queue instead of blocking forever.
     */
    private static final long SELECT_TIMEOUT_MS = 100L;

    private static final AtomicInteger COUNT = new AtomicInteger();
    private static final AtomicInteger TASKCOUNT = new AtomicInteger();
    private static final BlockingQueue<Task> QUEUE = new LinkedBlockingQueue<Task>();

    public final Selector selector;

    public Reactor() throws IOException {
        selector = Selector.open();
    }

    /** @return number of responses fully received so far */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    /** @return number of tasks waiting to be started */
    public static int getQueueSize() {
        return QUEUE.size();
    }

    /** Reserves one connection slot. */
    public int startTask() {
        return TASKCOUNT.incrementAndGet();
    }

    /** Releases one connection slot. */
    public int finishTask() {
        return TASKCOUNT.decrementAndGet();
    }

    /** Counts one completed response. */
    public int incrementAndGet() {
        return COUNT.incrementAndGet();
    }

    /** Enqueues a task on the shared queue. */
    public void addTask(Task task) {
        try {
            QUEUE.put(task);
        } catch (InterruptedException e) {
            // Preserve the interrupt so the run() loop can observe it and stop.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    @Override
    public void run() {
        try {
            while (!Thread.interrupted()) {
                startQueuedTasks();
                selector.select(SELECT_TIMEOUT_MS);
                // Every ready key carries the handler for its current phase
                // (connect/write or read) as its attachment.
                Set<SelectionKey> selectedKeys = selector.selectedKeys();
                for (SelectionKey key : selectedKeys) {
                    dispatch(key);
                }
                selectedKeys.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens connections for queued tasks while slots are available. */
    private void startQueuedTasks() {
        while (TASKCOUNT.get() < MAX_CLIENT) {
            Task task = QUEUE.poll();
            if (task == null) {
                return;
            }
            new Connector(this, task).run();
        }
    }

    /**
     * Runs the handler attached to the key.
     *
     * @param key a selected key
     */
    void dispatch(SelectionKey key) {
        Runnable handler = (Runnable) key.attachment();
        if (handler == null) {
            return;
        }
        try {
            handler.run();
        } catch (RuntimeException e) {
            // One misbehaving connection must not terminate the whole event loop.
            e.printStackTrace();
        }
    }
}

/** Opens a non-blocking connection for one task and registers it with the reactor. */
class Connector implements Runnable {
    private final Reactor reactor;
    private final Task task;

    public Connector(Reactor reactor, Task task) {
        this.reactor = reactor;
        this.task = task;
    }

    @Override
    public void run() {
        reactor.startTask();
        task.setStarttime(System.currentTimeMillis());

        BaseHandler base = new BaseHandler();
        base.setTask(task);
        base.setSelector(reactor.selector);
        base.setReactor(reactor);
        try {
            SocketChannel socketChannel = SocketChannel.open();
            base.setSocketChannel(socketChannel);
            socketChannel.configureBlocking(false);
            SocketAddress addr = new InetSocketAddress(task.getHost(), task.getPort());
            // connect() may complete immediately; then OP_CONNECT would never fire,
            // so go straight to the write phase.
            int ops = socketChannel.connect(addr) ? SelectionKey.OP_WRITE : SelectionKey.OP_CONNECT;
            socketChannel.register(reactor.selector, ops, new SocketWriteHandler(base));
        } catch (IOException e) {
            base.release(e);
        } catch (UnresolvedAddressException e) {
            // Unchecked: thrown by connect() when the host name cannot be resolved.
            base.release(e);
        }
    }
}

/** Per-connection state shared by the write and read handlers. */
class BaseHandler {
    private Selector selector;
    private SocketChannel socketChannel;
    private Task task;
    private ByteBuffer byteBuffer = ByteBuffer.allocate(2400);
    private Reactor reactor;
    private boolean released;

    public Reactor getReactor() {
        return reactor;
    }

    public void setReactor(Reactor reactor) {
        this.reactor = reactor;
    }

    public Selector getSelector() {
        return selector;
    }

    public void setSelector(Selector selector) {
        this.selector = selector;
    }

    public SocketChannel getSocketChannel() {
        return socketChannel;
    }

    public void setSocketChannel(SocketChannel socketChannel) {
        this.socketChannel = socketChannel;
    }

    public Task getTask() {
        return task;
    }

    public void setTask(Task task) {
        this.task = task;
    }

    public ByteBuffer getByteBuffer() {
        return byteBuffer;
    }

    /**
     * Closes the channel and gives the connection slot back to the reactor. Only the first
     * call has an effect, so success and failure paths may both call it.
     *
     * @param cause the failure that ended the connection, or {@code null} on normal completion
     */
    void release(Exception cause) {
        if (cause != null) {
            cause.printStackTrace();
        }
        if (released) {
            return;
        }
        released = true;
        if (socketChannel != null) {
            try {
                socketChannel.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (reactor != null) {
            reactor.finishTask();
        }
    }
}

/** Completes the connection and sends the HTTP/1.0 GET request. */
class SocketWriteHandler implements Runnable {
    private static final Charset UTF8 = Charset.forName("UTF-8");

    BaseHandler baseHandler;
    /** The encoded request; sized to fit, so long paths cannot overflow it. */
    private final ByteBuffer request;

    public SocketWriteHandler(BaseHandler baseHandler) {
        this.baseHandler = baseHandler;
        Task task = baseHandler.getTask();
        String head = "GET " + task.getCurrentPath() + " HTTP/1.0\r\n"
                + "Host: " + task.getHost() + "\r\n"
                + "Accept: */*\r\n"
                + "\r\n";
        request = ByteBuffer.wrap(head.getBytes(UTF8));
    }

    @Override
    public void run() {
        SocketChannel channel = baseHandler.getSocketChannel();
        try {
            if (!channel.finishConnect()) {
                // Not connected yet: stay registered and wait for the next event
                // instead of spinning on the reactor thread.
                return;
            }
            channel.write(request);
            if (request.hasRemaining()) {
                // Partial write: keep position/limit as they are and resume when writable.
                channel.register(baseHandler.getSelector(), SelectionKey.OP_WRITE, this);
            } else {
                baseHandler.getByteBuffer().clear();
                channel.register(baseHandler.getSelector(), SelectionKey.OP_READ,
                        new SocketReadHandler(baseHandler));
            }
        } catch (IOException e) {
            baseHandler.release(e);
        }
    }
}

/** Reads the response until EOF, decoding it as GBK, then hands the task to the parser. */
class SocketReadHandler implements Runnable {
    Charset charset = Charset.forName("utf8");
    Charset gbkcharset = Charset.forName("gbk");
    BaseHandler baseHandler;

    /** Stateful decoder so a multi-byte character split across two reads is decoded intact. */
    private final CharsetDecoder decoder = gbkcharset.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    private final CharBuffer chars = CharBuffer.allocate(2400);

    public SocketReadHandler(BaseHandler baseHandler) {
        this.baseHandler = baseHandler;
    }

    @Override
    public void run() {
        SocketChannel channel = baseHandler.getSocketChannel();
        ByteBuffer byteBuffer = baseHandler.getByteBuffer();
        Task task = baseHandler.getTask();
        try {
            int length;
            while ((length = channel.read(byteBuffer)) > 0) {
                byteBuffer.flip();
                decode(byteBuffer, false);
                // Keeps the bytes of an incomplete trailing character for the next read.
                byteBuffer.compact();
            }
            if (length == -1) {
                byteBuffer.flip();
                decode(byteBuffer, true);
                task.setEndtime(System.currentTimeMillis());
                baseHandler.getReactor().incrementAndGet();
                baseHandler.release(null);
                new ParseHandler(task, baseHandler.getReactor()).run();
            } else {
                channel.register(baseHandler.getSelector(), SelectionKey.OP_READ, this);
            }
        } catch (IOException e) {
            baseHandler.release(e);
        }
    }

    /** Decodes the readable bytes of {@code in} and appends the text to the task content. */
    private void decode(ByteBuffer in, boolean endOfInput) {
        StringBuilder content = baseHandler.getTask().getContent();
        CoderResult result;
        do {
            result = decoder.decode(in, chars, endOfInput);
            drainTo(content);
        } while (result.isOverflow());
        if (endOfInput) {
            do {
                result = decoder.flush(chars);
                drainTo(content);
            } while (result.isOverflow());
        }
    }

    private void drainTo(StringBuilder content) {
        chars.flip();
        content.append(chars);
        chars.clear();
    }
}

/** Entry point: starts the reactors, seeds the crawl and prints throughput once a second. */
public class Probe {
    public static void main(String[] args) throws IOException, InterruptedException {
        for (int i = 0; i < 2; i++) {
            Reactor reactor = new Reactor();
            if (i == 0) {
                // The queue is shared by all reactors; seeding it once avoids a duplicate fetch.
                reactor.addTask(new Task("news.163.com", 80, "/index.html"));
            }
            new Thread(reactor, "ReactorThread_" + i).start();
        }
        long start = System.currentTimeMillis();
        while (true) {
            Thread.sleep(1000);
            long end = System.currentTimeMillis();
            // Float division (the elapsed time used to be truncated to whole seconds);
            // the lower bound rules out a division by zero.
            float interval = Math.max((end - start) / 1000f, 0.001f);
            int connectTotal = Reactor.GETCOUNT();
            int persistenceTotal = PersistenceHandler.GETCOUNT();
            int connectps = Math.round(connectTotal / interval);
            int persistenceps = Math.round(persistenceTotal / interval);
            System.out.print("\r连接总数:" + connectTotal + "\t每秒连接:" + connectps
                    + "\t连接队列剩余:" + Reactor.getQueueSize()
                    + "\t持久化总数:" + persistenceTotal + "\t每秒持久化:" + persistenceps
                    + "\t持久化队列剩余:" + PersistenceHandler.getInstance().getSize());
        }
    }
}

/** One URL to fetch, plus the timing, status and body collected while fetching it. */
class Task {
    private String host;
    private int port;
    private String currentPath;
    private long starttime;
    private long endtime;
    private String type;
    private StringBuilder content = new StringBuilder(2400);
    private int state;
    private boolean isValid = true;

    public Task() {
    }

    public Task(String host, int port, String path) {
        init(host, port, path);
    }

    public void init(String host, int port, String path) {
        this.setCurrentPath(path);
        this.host = host;
        this.port = port;
    }

    public long getStarttime() {
        return starttime;
    }

    public void setStarttime(long starttime) {
        this.starttime = starttime;
    }

    public long getEndtime() {
        return endtime;
    }

    public void setEndtime(long endtime) {
        this.endtime = endtime;
    }

    public boolean isValid() {
        return isValid;
    }

    public void setValid(boolean isValid) {
        this.isValid = isValid;
    }

    public int getState() {
        return state;
    }

    public void setState(int state) {
        this.state = state;
    }

    public String getCurrentPath() {
        return currentPath;
    }

    /** Sets the path and derives {@link #getType()} from its file extension. */
    public void setCurrentPath(String currentPath) {
        this.currentPath = currentPath;
        this.type = extensionOf(currentPath);
    }

    /**
     * Returns the extension of the last path segment, ignoring any query string or fragment.
     * Returns {@code ""} when that segment has no dot (previously the whole path was returned,
     * and a dot located after {@code ?} raised StringIndexOutOfBoundsException).
     */
    private static String extensionOf(String path) {
        int end = path.length();
        int query = path.indexOf('?');
        if (query != -1) {
            end = query;
        }
        int fragment = path.indexOf('#');
        if (fragment != -1 && fragment < end) {
            end = fragment;
        }
        String bare = path.substring(0, end);
        int dot = bare.lastIndexOf('.');
        if (dot <= bare.lastIndexOf('/')) {
            return "";
        }
        return bare.substring(dot + 1);
    }

    public long getTaskTime() {
        return getEndtime() - getStarttime();
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getHost() {
        return host;
    }

    public int getPort() {
        return port;
    }

    public StringBuilder getContent() {
        return content;
    }

    public void setContent(StringBuilder content) {
        this.content = content;
    }
}

/**
 * Extracts the status code and the quoted {@code .htm} links from a finished task, enqueues
 * links not seen before, and passes the task on for persistence.
 */
class ParseHandler implements Runnable {
    /** host + path of every task ever enqueued by a parser; guarded by its own monitor. */
    private static final Set<String> SET = new HashSet<String>();
    /** A double-quoted string containing ".htm". */
    private static final Pattern LINK = Pattern.compile("\"[^\"]+\\.htm[^\"]*\"");

    PersistenceHandler persistencehandler = PersistenceHandler.getInstance();
    /** Hosts that absolute links may point to. */
    List<String> domainlist = new ArrayList<String>();
    Task task;
    private Reactor reactor;

    /** One link-resolution rule; calls {@code chain} when it does not apply. */
    private interface Filter {
        void doFilter(Task fatherTask, Task newTask, String path, Filter chain);
    }

    /** Tries the rules in order until one of them handles the link. */
    private class FilterChain implements Filter {
        private List<Filter> list = new ArrayList<Filter>();

        {
            addFilter(new TwoLevel());
            addFilter(new OneLevel());
            addFilter(new FullPath());
            addFilter(new Root());
            addFilter(new Default());
        }

        private void addFilter(Filter filter) {
            list.add(filter);
        }

        // Created after the initializer block above, so it sees all five filters.
        private Iterator<Filter> it = list.iterator();

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (it.hasNext()) {
                it.next().doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Links starting with "../../". */
    private class TwoLevel implements Filter {
        private static final String MARK = "../../";

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith(MARK)) {
                String prefix = getPrefix(fatherTask.getCurrentPath(), 3);
                newTask.init(fatherTask.getHost(), fatherTask.getPort(),
                        join(prefix, path.substring(MARK.length())));
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Links starting with "../". */
    private class OneLevel implements Filter {
        private static final String MARK = "../";

        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith(MARK)) {
                String prefix = getPrefix(fatherTask.getCurrentPath(), 2);
                newTask.init(fatherTask.getHost(), fatherTask.getPort(),
                        join(prefix, path.substring(MARK.length())));
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Absolute http:// links; only whitelisted hosts are accepted. */
    private class FullPath implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (!path.startsWith("http://")) {
                chain.doFilter(fatherTask, newTask, path, chain);
                return;
            }
            for (String domain : domainlist) {
                String origin = "http://" + domain + "/";
                if (path.startsWith(origin)) {
                    // Strip only the leading origin; the same text later in the URL is kept.
                    newTask.init(domain, fatherTask.getPort(),
                            "/" + path.substring(origin.length()));
                    return;
                }
            }
            newTask.setValid(false);
        }
    }

    /** Host-relative links starting with "/". */
    private class Root implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.startsWith("/")) {
                newTask.init(fatherTask.getHost(), fatherTask.getPort(), path);
            } else {
                chain.doFilter(fatherTask, newTask, path, chain);
            }
        }
    }

    /** Everything else: relative to the parent's directory; other schemes are rejected. */
    private class Default implements Filter {
        @Override
        public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
            if (path.contains(":")) {
                newTask.setValid(false);
                return;
            }
            String prefix = getPrefix(fatherTask.getCurrentPath(), 1);
            newTask.init(fatherTask.getHost(), fatherTask.getPort(), join(prefix, path));
        }
    }

    public ParseHandler(Task task, Reactor reactor) {
        this.task = task;
        this.reactor = reactor;
        // Whitelist of crawlable hosts.
        this.domainlist.add("news.163.com");
    }

    /**
     * Stores the status code from the status line ("HTTP/x.y NNN ...") in the task, or 0 when
     * the response does not start with a well-formed status line.
     */
    private void parseTaskState(Task task) {
        StringBuilder content = task.getContent();
        int state = 0;
        if (content.length() >= 12 && "HTTP/".equals(content.substring(0, 5))) {
            try {
                state = Integer.parseInt(content.substring(9, 12));
            } catch (NumberFormatException e) {
                state = 0;
            }
        }
        task.setState(state);
    }

    /**
     * Resolves {@code path} against the parent task and enqueues it unless it is invalid or
     * has been enqueued before.
     *
     * @param fatherTask the page the link was found on
     * @param path the link text without the surrounding quotes
     */
    private void createNewTask(Task fatherTask, String path) {
        Task newTask = new Task();
        FilterChain filterchain = new FilterChain();
        filterchain.doFilter(fatherTask, newTask, path, filterchain);
        if (!newTask.isValid()) {
            return;
        }
        String key = newTask.getHost() + newTask.getCurrentPath();
        synchronized (SET) {
            if (!SET.add(key)) {
                return;
            }
        }
        reactor.addTask(newTask);
    }

    /**
     * Removes up to {@code count} trailing "/segment" parts from {@code s}; returns "/" when
     * nothing is left. Stops early when no "/" remains instead of throwing.
     */
    private String getPrefix(String s, int count) {
        String prefix = s;
        while (count > 0) {
            int slash = prefix.lastIndexOf("/");
            if (slash < 0) {
                prefix = "";
                break;
            }
            prefix = prefix.substring(0, slash);
            count--;
        }
        return "".equals(prefix) ? "/" : prefix;
    }

    /** Joins a directory and a relative path with exactly one "/" between them. */
    private static String join(String dir, String rest) {
        return dir.endsWith("/") ? dir + rest : dir + "/" + rest;
    }

    @Override
    public void run() {
        try {
            parseTaskState(task);
            if (200 == task.getState()) {
                Matcher matcher = LINK.matcher(task.getContent());
                while (matcher.find()) {
                    String quoted = matcher.group();
                    boolean looksLikeCode = quoted.contains(" ") || quoted.contains("\t")
                            || quoted.contains("(") || quoted.contains(")");
                    if (!looksLikeCode) {
                        createNewTask(task, quoted.substring(1, quoted.length() - 1));
                    }
                }
            }
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
        // Record the fetch even when link extraction failed.
        persistencehandler.addTask(task);
    }
}

/** Singleton that writes finished tasks to MySQL in batches on its own thread. */
class PersistenceHandler implements Runnable {
    private static final int BATCH_SIZE = 500;

    private static class SingletonHandler {
        private static PersistenceHandler obj = new PersistenceHandler();
    }

    public static PersistenceHandler getInstance() {
        return SingletonHandler.obj;
    }

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** @return number of tasks added to a batch so far */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    private static final AtomicInteger COUNT = new AtomicInteger();

    private BlockingQueue<Task> persistencelist;
    private Connection conn;
    private PreparedStatement ps;
    /** Rows added to the current batch and not yet committed. */
    private int pending;

    public PersistenceHandler() {
        this.persistencelist = new LinkedBlockingQueue<Task>();
        new Thread(this, "PersistenceThread").start();
    }

    public void addTask(Task task) {
        try {
            this.persistencelist.put(task);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    public int getSize() {
        return persistencelist.size();
    }

    @Override
    public void run() {
        try {
            conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/mvbox", "xx", "xx");
            conn.setAutoCommit(false);
            ps = conn.prepareStatement(
                    "insert into probe(host,path,state,tasktime,type,length,createtime)"
                            + " values(?,?,?,?,?,?,?)");
        } catch (SQLException e) {
            // Without a statement nothing can be stored; stop here rather than fail with a
            // NullPointerException on the first task.
            e.printStackTrace();
            return;
        }
        while (!Thread.currentThread().isInterrupted()) {
            this.handler();
        }
    }

    /**
     * Adds the next task to the batch and commits when the batch is full or the queue has
     * been idle for a second, so a final partial batch is not left uncommitted.
     */
    private void handler() {
        try {
            Task task = persistencelist.poll(1, TimeUnit.SECONDS);
            if (task != null) {
                ps.setString(1, task.getHost());
                ps.setString(2, task.getCurrentPath());
                ps.setInt(3, task.getState());
                ps.setLong(4, task.getTaskTime());
                ps.setString(5, task.getType());
                ps.setInt(6, task.getContent().length());
                ps.setTimestamp(7, new Timestamp(task.getEndtime()));
                ps.addBatch();
                pending++;
                COUNT.incrementAndGet();
            }
            boolean idleWithRows = task == null && pending > 0;
            if (pending >= BATCH_SIZE || idleWithRows) {
                ps.executeBatch();
                conn.commit();
                pending = 0;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (SQLException e) {
            e.printStackTrace();
            discardBatch();
        }
    }

    /** Drops the failed batch so that later rows can still be stored. */
    private void discardBatch() {
        pending = 0;
        try {
            ps.clearBatch();
            conn.rollback();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
声明:本站所有文章资源内容,如无特殊说明或标注,均为采集网络资源。如若本站内容侵犯了原著者的合法权益,可联系本站删除。