Python+Java简单抓取天气信息
背景:
使用Python抓取网站的各省份的天气url地址,写入到json文件中;再使用Java读取json文件,
根据地名获取到查询天气的url地址,访问url地址,抓取天气信息。
此处用到的网站地址:http://tianqi.8684.cn
Python抓取网站的各省份天气部分:
"""Scrape the per-city weather page URLs from http://tianqi.8684.cn.

Each city is printed to stdout as one JSON object followed by a comma,
e.g. ``{"city":"...","url":"..."},`` so the output can be pasted between
``[`` and ``]`` (dropping the final comma) to form a JSON array.
"""
import re
import json
import requests
import random
from urllib.parse import urljoin

from lxml import etree

# Root of the weather site. Every scraped href is resolved against it.
TQ_URL = "http://tianqi.8684.cn"

# Pool of User-Agent strings; one is chosen at random when HEADER is built.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]

# Request headers shared by every HTTP call in this script.
HEADER = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
}


def _fetch_html(url):
    """Download *url* and return it as an lxml element tree.

    Returns ``None`` when the response body is empty.
    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    res = requests.get(url, headers=HEADER, timeout=20)
    res.raise_for_status()
    text = res.content.decode('utf-8', 'ignore')
    if not text.strip():
        return None
    return etree.HTML(text)


def _city_record(city, url):
    """Return one output line: a compact JSON object plus a trailing comma.

    ``json.dumps`` escapes quotes and backslashes in the values, which plain
    string formatting does not. The compact separators keep the same layout
    the script has always printed.
    """
    record = {"city": city, "url": url}
    return json.dumps(record, ensure_ascii=False, separators=(",", ":")) + ","


def catch_province():
    """Visit the site's index page and scrape every province it links to."""
    try:
        html = _fetch_html(TQ_URL)
    except (requests.RequestException, etree.LxmlError) as e:
        print(e)
        return
    if html is None:
        return

    containers = html.xpath("//div[@class='b1 mb10'][2]")
    if not containers:
        print("province list container not found on {}".format(TQ_URL))
        return

    # The leading "." matters: a path starting with "//" is evaluated against
    # the whole document even when called on an element.
    province_links = containers[0].xpath(".//div[@class='p-sort']/a")
    for each in province_links:
        href = each.get("href")
        if not href:
            continue
        sub_url = urljoin(TQ_URL, href)
        try:
            catch_sub_city(sub_url, each.text)
        except (requests.RequestException, etree.LxmlError) as e:
            # One unreachable province must not abort the whole crawl.
            print(e)


def catch_sub_city(url, provinceName):
    """Print a JSON record for every city linked from a province page.

    url: address of the province page.
    provinceName: province label; accepted for the caller's convenience and
        not used in the output.
    """
    html = _fetch_html(url)
    if html is None:
        return
    city_links = html.xpath(
        "//div[@class='b1 mb10 oh']/ul[@class='w-province']/li/p/a"
    )
    for each in city_links:
        href = each.get("href")
        if not each.text or not href:
            # Skip anchors without a label or target instead of emitting "None".
            continue
        print(_city_record(each.text, urljoin(TQ_URL, href)))


if __name__ == "__main__":
    catch_province()
Python部分使用Python3,需要用到lxml、requests模块
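脚本会把每个城市逐行打印为 {"city":"...","url":"..."}, 的形式;将这些输出去掉最后一个逗号并用 [ ] 包起来,保存为 weatherCity.json(Java部分从classpath读取该文件)。这里只列举部分结果: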
1 2 3 4 5 6 7 8 9 10 11 | [ .... ] |
WeatherCity类,保存城市名称及其查询天气的URL地址:
package com.wjyup.vo;

import java.io.Serializable;

/**
 * Value object pairing a city name with the URL used to query its weather.
 */
public class WeatherCity implements Serializable {

    private static final long serialVersionUID = 1L;

    /** City name. */
    private String city;

    /** URL of the weather query page for the city. */
    private String url;

    /** Creates an empty instance; both fields start as {@code null}. */
    public WeatherCity() {
    }

    /** @return the city name */
    public String getCity() {
        return city;
    }

    /** @param city the city name */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the weather query URL */
    public String getUrl() {
        return url;
    }

    /** @param url the weather query URL */
    public void setUrl(String url) {
        this.url = url;
    }
}
Java读取json、根据地名查询天气的JUnit测试类(先是Spring测试基类SpringJunit,然后是测试类ResourcesTest1):
1 2 3 4 5 6 7 8 9 10 11 | package junit; import org.junit.runner.RunWith; import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; @RunWith (SpringJUnit4ClassRunner. class ) @ContextConfiguration (locations = "classpath:applicationContext.xml" ) public class SpringJunit { } |
package junit;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import com.alibaba.fastjson.JSONArray;
import com.wjyup.vo.WeatherCity;

import us.codecraft.xsoup.Xsoup;

/**
 * Loads the city-to-URL table from {@code /weatherCity.json} on the classpath
 * and looks up the weather forecast for a city by scraping its page.
 */
public class ResourcesTest1 extends SpringJunit {

    /** Message template returned when no forecast could be obtained. */
    private static final String NOT_FOUND_TEMPLATE = "未查询到查询[%s]的天气信息!";

    /** User-Agent header sent with the forecast request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

    /** City name -> weather page URL, filled from weatherCity.json. */
    private HashMap<String, String> weatherCity = new HashMap<>(2586);

    /**
     * Loads weatherCity.json into the lookup map, queries the forecast for
     * one city and prints the result along with the elapsed times.
     * Does nothing when the resource is missing or empty.
     */
    @Test
    public void weatherCityLoadingTest() {
        // try-with-resources closes the stream even when parsing fails;
        // a null resource is permitted and simply skipped on close.
        try (InputStream input = ResourcesTest1.class.getResourceAsStream("/weatherCity.json")) {
            if (input == null) {
                return;
            }
            String json = readUtf8(input);
            if (json.isEmpty()) {
                return;
            }
            List<WeatherCity> list = JSONArray.parseArray(json, WeatherCity.class);

            // Fill the cache.
            long start = System.currentTimeMillis();
            for (WeatherCity wc : list) {
                weatherCity.put(wc.getCity(), wc.getUrl());
            }
            long end = System.currentTimeMillis();

            // Run one lookup.
            long start1 = System.currentTimeMillis();
            String result = queryWeatherInfo("北京");
            long end1 = System.currentTimeMillis();

            System.out.println(result);
            System.out.println("添加=耗时:" + (end - start));
            System.out.println("查询=耗时:" + (end1 - start1));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole stream and decodes it as UTF-8.
     * A single {@code read()} sized by {@code available()} may return only
     * part of the data, and the platform default charset may not be UTF-8.
     * NOTE(review): assumes weatherCity.json is saved as UTF-8 - confirm.
     *
     * @param input stream to drain; not closed by this method
     * @return the decoded text
     * @throws IOException if reading fails
     */
    private static String readUtf8(InputStream input) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = input.read(chunk)) != -1) {
            buffer.write(chunk, 0, n);
        }
        return new String(buffer.toByteArray(), StandardCharsets.UTF_8);
    }

    /**
     * Queries the weather forecast for a city.
     *
     * @param cityName city name, used as the key into the loaded table
     * @return the forecast text, a "not found" message when the city is
     *         unknown or the page could not be scraped, or {@code null}
     *         when {@code cityName} is blank
     */
    private String queryWeatherInfo(String cityName) {
        if (StringUtils.isBlank(cityName)) {
            return null;
        }
        String notFound = String.format(NOT_FOUND_TEMPLATE, cityName);

        // Direct hash lookup; no need to walk every entry.
        String url = weatherCity.get(cityName);
        if (StringUtils.isBlank(url)) {
            return notFound;
        }

        try {
            Response resp = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(10000)
                    .execute();
            if (resp.statusCode() != 200) {
                return notFound;
            }

            Document doc = resp.parse();
            List<Element> list = Xsoup.select(doc, "//div[@class='w-forecast mb10']/div").getElements();
            if (list.isEmpty()) {
                return notFound;
            }
            list = Xsoup.select(list.get(0), "//ul[@class='wf-mod wicon']/li").getElements();

            StringBuilder tq = new StringBuilder();
            for (Element el : list) {
                // Rows are separated by a newline, with none after the last row.
                if (tq.length() > 0) {
                    tq.append("\n");
                }
                // Date
                String date = Xsoup.select(el, "//span/text()").get();
                // Temperature
                String temperature = Xsoup.select(el, "//div/p/text()").get();
                // Remaining details
                List<Element> tempList = Xsoup.select(el, "//div/em").getElements();
                tq.append(StringUtils.defaultString(date)).append(" ")
                        .append(StringUtils.defaultString(temperature)).append(" ");
                for (Element e : tempList) {
                    tq.append(e.text()).append(" ");
                }
            }

            // Too little text means the page layout did not match; report
            // "not found" instead of returning the raw "%s" template.
            if (tq.length() > 10) {
                return cityName + "天气信息如下:\n" + tq.toString();
            }
            return notFound;
        } catch (IOException e) {
            e.printStackTrace();
            return notFound;
        }
    }
}
Java部分需要用到的jar包,maven配置文件:
<!-- Dependencies used by the Java part. Place inside <dependencies> in pom.xml. -->
<!-- NOTE(review): SpringJunit imports org.springframework.test.*, so spring-test
     must also be declared somewhere in the project; it is not listed here. -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>
<!-- JSON parsing (JSONArray.parseArray).
     NOTE(review): 1.1.37 is an old fastjson release; consider upgrading. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.1.37</version>
</dependency>
<!-- HTTP fetch and HTML parsing. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.1</version>
</dependency>
<!-- XPath selectors on top of jsoup. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>xsoup</artifactId>
    <version>0.3.1</version>
</dependency>
<!-- StringUtils. -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.1</version>
</dependency>
<!-- NOTE(review): the Java code shown imports only commons-lang3; this older
     commons-lang artifact may be unnecessary - confirm before removing. -->
<dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
</dependency>