newplan

阿基米德在洗澡時發現浮力原理,高興得來不及穿上褲子,跑到街上大喊:Eureka(我找到了)。
posts - 39, comments - 26, trackbacks - 0, articles - 4
  C++博客 :: 首页 :: 新随笔 :: 联系 :: 聚合  :: 管理
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Task that inspects a single line of page text. When the line contains the
 * search keyword, an excerpt of at most 30 characters starting at the keyword
 * is printed to standard output together with the URL the line came from.
 */
class scraber implements Runnable
{
    /** Line of text to inspect; replaced by the excerpt once a match is found. */
    private String ss;

    /** Address of the page the line was read from. */
    private String url;

    /**
     * Creates a task for one line of a fetched page.
     *
     * @param ss  the line of text to inspect
     * @param url the hyperlink of the page the line belongs to
     */
    public scraber(String ss, String url)
    {
        this.url = url;
        this.ss = ss;
    }

    /**
     * Looks for the keyword and prints the trimmed excerpt; prints nothing
     * when the keyword does not occur in the line.
     */
    public void run()
    {
        String keyword = "不错";

        int start = ss.indexOf(keyword);
        if (start < 0)
        {
            // Keyword absent: nothing to report for this line.
            return;
        }

        // Cut the line down so the output stays short and easy to read:
        // drop everything before the keyword, then keep at most 30 characters.
        String tail = ss.substring(start);
        int limit = Math.min(30, tail.length());
        ss = tail.substring(0, limit);

        System.out.println(url + "  " + "begin:" + ss);
    }
}

/**
 * 
@author Administrator
 * 在广度遍历的每一个枝点安放一个机器人线程,该线程负责该枝点一下的搜索
 
*/

class irobot implements Runnable
{    
    
private String ss;
    
public irobot(String ss)
    
{
        
this.ss=ss;
    }

    
public void run()
    
{    
        MySearch ms
=new MySearch();
        
/**
        *    线程独立启动对该URL内容的检索
        
*/

        
try
        
{
            ms.checkURL(ss);
        }

        
catch(IOException e)
        
{
            
        }

    }

}

class MySearch{
    
/**
     * 使用线程池来进行大批量线程的管理,这里规定当前最多线程数为10.可重用的线程
     * 不断尝试着和newCachedThreadPool() 进行比较,发现newCachedThreadPool() 比较容易崩溃,但是速度非常快
     * 适合于快速的查询,但是要对查询的深度进行严格限制。使用newFixedThreadPool(10)适度相对慢一些,但是查询稳定,因为
     * 查询已经限制好了在10个线程的范围内,但是到一定程度也会崩溃,这个程度比前面深,最主要是由于内存不足,算法本身对内存的需求很大。
     
*/

    ExecutorService    exec 
= Executors.newFixedThreadPool(10);
    
/**由于多线程公用该变量,直接设置它为静态变量
     * 使用hashMap使得对URL的重复性查询检索效率倍增
     
*/

    
static HashMap<String,Integer>  hashList = new HashMap<String,Integer>();
    
/**由于多线程公用该变量,直接设置它为静态变量
     * URL记录在该文件中
     
*/

    
static File store=new File("d://link.txt");    
    
/**由于多线程公用该变量,直接设置它为静态变量
     * 文件流
     
*/

    
static FileWriter writeFile;  
    
public void checkURL(String uu)throws IOException
    
{                       
        
if(uu.endsWith("/"))
        
{
            uu
=uu.substring(0,uu.length()-1);
        }

        
try{
            URL u 
= new URL(uu);    
            InputStream inn 
=u.openStream();
            
if(hashList.containsKey(uu))
            
{
                Integer numLink
=hashList.get(uu);
                hashList.put(uu, 
++numLink);
                
return;
            }
 
            
            hashList.put(uu, 
1);
            
            writeFile.append(uu
+"\n");
            
            BufferedReader in 
= new BufferedReader(new InputStreamReader(inn,"ISO-8859-1"));
            
            String ss;
            
            ArrayList
<String> tempList=new ArrayList<String>();
            
            
int sureOfCharset = 0
            
/**
             * 当前所搜索的文档,查询一遍,查找所需要查找的句子
             * 并做超链接提取工作,存放到临时链接队列中。
             
*/

            
while((ss=in.readLine())!=null)
            
{
                
/**
                 * 注意对文档的字符编码进行转换,一般是转换成UTF-8格式
                 
*/

                ss
= ss.trim();
                
if(sureOfCharset == 0)
                
{
                    
/**
                     * 用正则表达式匹配更加精准,效率更好
                     
*/

                    
if(ss.contains("charset"))
                    
{
                    
                        String pattern1
="(utf-8|UTF-8)";
                        String pattern2
="(gbk|GBK)";
                        String pattern3
="(gb2312|GB2312)";
                        Pattern p1
=Pattern.compile(pattern1);
                        Pattern p2
=Pattern.compile(pattern2);
                        Pattern p3
=Pattern.compile(pattern3);
                        Matcher m1
=p1.matcher(ss);
                        Matcher m2
=p2.matcher(ss);
                        Matcher m3
=p3.matcher(ss);
                        
/**
                         * 匹配模式一成功,即文档为utf8编码方式
                         
*/

                        
if(m1.find())
                        
{    
                            sureOfCharset 
=1;
                        }

                        
/**
                         * 匹配模式二成功,即文档为GBK编码方式
                         
*/

                        
else if(m2.find())
                        
{
                            sureOfCharset 
=2;
                        }

                        
/**
                         * 匹配模式三成功,即文档为gb312编码方式
                         
*/

                        
else if(m3.find())
                        
{    
                            sureOfCharset 
=3;
                        }

                        
/**
                         * 匹配模式失败
                         
*/

                        
else
                        
{
                            sureOfCharset 
= 4;
                        }

                    }

                    
                }

                
switch(sureOfCharset)
                
{
                
case 0:{    
                        ss 
= new String(ss.getBytes("ISO-8859-1"),"utf-8");
                        
break;    
                        }

                
case 1:{
                        ss 
= new String(ss.getBytes("ISO-8859-1"),"utf-8");
                        
break;    
                        }
                                
                
case 2:{
                        ss 
= new String(ss.getBytes("ISO-8859-1"),"gbk");
                        
break;    
                        }

                
case 3:{
                        ss
= new String(ss.getBytes("ISO-8859-1"),"gb2312");
                        
break;    
                        }

                
default:{    
                        ss 
= new String(ss.getBytes("ISO-8859-1"),"utf-8");
                        
break;    
                        }

                }

                
/**
                 * 开启爬虫线程抓取和“***”有关的语句
                 
*/

                scraber    buger
=new scraber(ss,uu);
                
                exec.execute(buger);
                
/**
                 * 提取超链接
                 
*/

                check(ss,tempList);
                
            }

            
/**
             * 对临时超链接队列进行处理
             
*/

            Iterator
<String> it=tempList.iterator();
            
/**
             * 提取出临时超链接队列中的每一个链接
             
*/

            
while(it.hasNext())
            
{
                
                String ref 
= (String)it.next();
                
/**
                 * 启动一个机器人线程,处理该超链接
                 
*/

                irobot ir
=new irobot(ref);
                Thread robThread
= new Thread(ir);
                robThread.start();
                
            }

            
/**
             * 清空临时队列
             
*/

            tempList.clear();
            
/**
             * 关闭线程池
             
*/

            exec.shutdown();
        }

        
catch(IOException e)
        
{
            
/**
             * 无法连接
             *
            
*/

        }

        
                                                                                            
    }

    
/**
     * 从该行中提取出超链接
     * 
@param s 该行字符串
     * 
@param tempList    临时超链接队列
     * 
     
*/

    
void check(String s,ArrayList<String> tempList)
    
{
        
int i=s.indexOf("href=\"");
        if(i>0)
        
{
            String news
=s.substring(i+6);
            
int j1= news.indexOf("\"");            
            if(j1>0)
            
{
                
                String ref 
= news.substring(0,j1);
                tempList.add(ref);
            }

    
        }


    }


    
public static void main(String args[])throws IOException
    
{    
        
        MySearch t
= new MySearch();
        System.out.println(
"%searching start%");
        t.go(
"http://www.cppblog.com/"); 
    }



    
void go(String uu)
    
{
        
try{
            writeFile
= new FileWriter(store);
            checkURL(uu);
            writeFile.close();
        }
catch(IOException e)
        
{
            
        }

    
    }

}


只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   博问   Chat2DB   管理