这篇文章主要介绍Java怎么爬取网页内容并输出到Excel中的相关知识,内容详细易懂,操作简单快捷,具有一定借鉴价值,相信大家阅读完这篇“Java怎么爬取网页内容并输出到Excel中”的文章都会有所收获,下面我们一起来看看吧。

爬虫技术简述

网络爬虫(Web Crawler)是按照一定的规则自动抓取万维网信息的程序或者脚本,如今被广泛应用于互联网搜索引擎以及其他类似网站。

爬虫在功能上分为采集、处理和储存三个部分。

爬虫基本上可以分为三大类:分布式爬虫、Java爬虫以及非Java爬虫。

在Java爬虫中,常用的框架又可以细分出三种:Crawler4j、WebMagic和WebCollector。

实例代码

添加依赖

<!-- JSON -->
<!-- NOTE(review): jackson-databind is not referenced by the sample code in this article; confirm it is needed. -->
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.12.0</version>
</dependency>
<!-- fastjson: provides the JSON class used by WeatherTest.getMap -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>
<!-- Excel -->
<!-- NOTE(review): jxl is not referenced by the sample code in this article; confirm it is needed. -->
<dependency>
    <groupId>net.sourceforge.jexcelapi</groupId>
    <artifactId>jxl</artifactId>
    <version>2.6.12</version>
</dependency>
<!-- Apache POI: provides the HSSF* classes used to build the workbook -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<!-- Web crawler: jsoup fetches and parses the HTML pages -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

创建一个Weather实体类

/**
 * Plain data holder for one day's weather.
 *
 * <p>Every attribute is stored as a {@code String} exactly as it was supplied
 * to the setter; no parsing or validation is performed. The class is a
 * conventional mutable bean (no-arg constructor plus getter/setter pairs).
 */
public class Weather {

    /** Date. */
    private String date;

    /** Highest temperature. */
    private String maxTemperature;

    /** Lowest temperature. */
    private String minTemperature;

    /** Daytime weather. */
    private String dayTimeWeather;

    /** Night-time weather. */
    private String nightWeather;

    /** Wind direction. */
    private String windDirection;

    /** Wind power. */
    private String windPower;

    /** @return the date, or {@code null} if it was never set */
    public String getDate() {
        return date;
    }

    /** @param date the date text to store */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the highest temperature, or {@code null} if it was never set */
    public String getMaxTemperature() {
        return maxTemperature;
    }

    /** @param maxTemperature the highest-temperature text to store */
    public void setMaxTemperature(String maxTemperature) {
        this.maxTemperature = maxTemperature;
    }

    /** @return the lowest temperature, or {@code null} if it was never set */
    public String getMinTemperature() {
        return minTemperature;
    }

    /** @param minTemperature the lowest-temperature text to store */
    public void setMinTemperature(String minTemperature) {
        this.minTemperature = minTemperature;
    }

    /** @return the daytime weather, or {@code null} if it was never set */
    public String getDayTimeWeather() {
        return dayTimeWeather;
    }

    /** @param dayTimeWeather the daytime-weather text to store */
    public void setDayTimeWeather(String dayTimeWeather) {
        this.dayTimeWeather = dayTimeWeather;
    }

    /** @return the night-time weather, or {@code null} if it was never set */
    public String getNightWeather() {
        return nightWeather;
    }

    /** @param nightWeather the night-time-weather text to store */
    public void setNightWeather(String nightWeather) {
        this.nightWeather = nightWeather;
    }

    /** @return the wind direction, or {@code null} if it was never set */
    public String getWindDirection() {
        return windDirection;
    }

    /** @param windDirection the wind-direction text to store */
    public void setWindDirection(String windDirection) {
        this.windDirection = windDirection;
    }

    /** @return the wind power, or {@code null} if it was never set */
    public String getWindPower() {
        return windPower;
    }

    /** @param windPower the wind-power text to store */
    public void setWindPower(String windPower) {
        this.windPower = windPower;
    }

    /**
     * Renders all seven attributes as {@code Weather{name='value',...}}.
     * Unset attributes appear as the text {@code null}.
     */
    @Override
    public String toString() {
        return "Weather{"
                + "date='" + date + '\''
                + ",maxTemperature='" + maxTemperature + '\''
                + ",minTemperature='" + minTemperature + '\''
                + ",dayTimeWeather='" + dayTimeWeather + '\''
                + ",nightWeather='" + nightWeather + '\''
                + ",windDirection='" + windDirection + '\''
                + ",windPower='" + windPower + '\''
                + '}';
    }
}

创建一个WeatherTest测试类

publicclassWeatherTest{publicstaticvoidmain(String[]args)throwsFileNotFoundException,IOException{List<Weather>list=getInfo("http://www.tianqi234.com/2020shanghai/1yue.html",12);for(Weatherweather:list){System.out.println(weather.toString());}testHSSFWorkbook(list);}//可以指定网址,并且按照需求爬取前多少页的数据publicstaticList<Weather>getInfo(Stringurl,intmonth){List<Weather>weatherList=newArrayList<Weather>();for(inti=1;i<month+1;i++){try{System.out.println("url:"+url);Documentdoc=Jsoup.connect(url).get();Elementstable=doc.select(".graybox_cnt");/*Elementstbody=table.select("tbody");*/ElementstrList=table.select("tr");//每次移除的时候,你的列表长度就会发生新的变化,所以要结合实际进行移除trList.remove(0);if(i>1){trList.remove(0);trList.remove(10);trList.remove(10);trList.remove(20);trList.remove(20);trList.remove(20);}else{trList.remove(11);trList.remove(11);trList.remove(21);trList.remove(21);trList.remove(21);}for(Elementtr:trList){ElementstdList=tr.select("td");ElementsaList=tdList.select("a");//查询a标签Weatherweather=newWeather();if(aList!=null&&aList.size()>0){weather.setDate(aList.get(0).html().toString());}else{weather.setDate(tdList.get(0).html().toString());}weather.setMaxTemperature(tdList.get(1).html().toString());weather.setMinTemperature(tdList.get(2).html().toString());weather.setDayTimeWeather(tdList.get(3).html().toString());weather.setNightWeather(tdList.get(4).html().toString());weather.setWindDirection(tdList.get(5).html().toString());weather.setWindPower(tdList.get(6).html().toString());weatherList.add(weather);}}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}url="http://www.tianqi234.com/2020shanghai/"+(i+1)+"yue.html";}returnweatherList;}publicstaticvoidtestHSSFWorkbook(List<Weather>list)throwsIOException{HSSFWorkbookworkbook=newHSSFWorkbook();//创建excel文件(workbook)HSSFSheetsheet=workbook.createSheet("2020年上海天气统计");HSSFRowrow=sheet.createRow(0);//创建行从0开始HSSFCellStylestyle=workbook.createCellStyle();//设置单元格样式style.setAlignment(HorizontalAlignment.CENTER);//水平居中style.setVerticalAli
gnment(VerticalAlignment.CENTER);//垂直居中SimpleDateFormatformat=newSimpleDateFormat("yyyy-MM-ddHH:mm:ss");sheet.setDefaultColumnWidth(30);row.setHeightInPoints(25);Map<String,String>map=(Map<String,String>)getMap(list.get(0));//设置表头intc=0;for(Stringkey:map.keySet()){HSSFCellcell=row.createCell(c);//创建行的单元格,从0开始cell.setCellValue(map.get(key));//设置单元格内容cell.setCellStyle(style);c++;}Map<Integer,Weather>weatherMap=newHashMap<>();//除去表头for(inti=1;i<list.size();i++){weatherMap.put(i,list.get(i));}for(inti=1;i<=weatherMap.size();i++){HSSFRowrowInfo=sheet.createRow(i);rowInfo.setHeightInPoints(30);Map<String,String>map1=(Map<String,String>)getMap(list.get(i));intj=0;for(Stringkey:map1.keySet()){HSSFCellcellInfo=rowInfo.createCell(j);cellInfo.setCellValue(map1.get(key));cellInfo.setCellStyle(style);j++;}}FileOutputStreamout=newFileOutputStream("D:\\weather1.xlsx");workbook.write(out);out.close();}/***json转map**@paramobject*@return*/publicstaticMap<?,?>getMap(Objectobject){if(object==null){thrownewRuntimeException("对象为空,转json失败");}Map<String,Object>map=newHashMap<>();try{map=(Map)JSON.parse(JSON.toJSONString(object));}catch(Exceptione){System.out.println("对象转map转换失败");}returnmap;}}

关于“Java怎么爬取网页内容并输出到Excel中”这篇文章的内容就介绍到这里,感谢各位的阅读!相信大家对“Java怎么爬取网页内容并输出到Excel中”知识都有一定的了解,大家如果还想学习更多知识,欢迎关注亿速云行业资讯频道。