Just do your best!!!

Python+Java简单抓取天气信息

背景:

使用Python抓取网站上各省份下属城市的天气url地址,并将输出结果保存为json文件;再使用Java读取该json文件,
根据地名获取到查询天气的url地址,访问该url地址并抓取天气信息。
此处用到的网站地址:http://tianqi.8684.cn

Python抓取网站的各省份天气部分:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
import json
import requests
import random
from lxml import etree
 
 
# Root URL of the weather site. The crawler functions read this global and
# append the hrefs found on the pages to it.
# NOTE(review): value taken from the site address and the sample output in the
# surrounding article (hrefs are assumed to start with "/") - confirm.
TQ_URL = "http://tianqi.8684.cn"

# Pool of User-Agent strings from which one is drawn for the request headers.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# Headers sent with every request. The User-Agent is chosen once, when the
# module is loaded, so all requests of a single run share the same value.
HEADER = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate'
}
 
# Fetch the weather URL of every province
def catch_province():
    """Crawl the site index page and descend into every province page.

    Downloads ``TQ_URL``, takes the second ``div.b1.mb10`` container, and for
    each province link inside its ``p-sort`` block calls ``catch_sub_city``
    with the absolute province URL and the link text.

    The function never raises: any error is printed. A failure while
    processing one province is reported and the crawl continues with the
    next province.
    """
    try:
        res = requests.get(TQ_URL, headers=HEADER, timeout=20)
        # Fail early on HTTP errors instead of parsing an error page.
        res.raise_for_status()
        text = res.content.decode('utf-8', 'ignore')
        if not text.strip():
            print("empty response from {}".format(TQ_URL))
            return
        html = etree.HTML(text)
        containers = html.xpath("//div[@class='b1 mb10'][2]")
        if not containers:
            print("province list container not found on {}".format(TQ_URL))
            return
        # ".//" keeps the search inside the selected container; a leading
        # "//" would search the whole document again and ignore the container.
        wealist = containers[0].xpath(".//div[@class='p-sort']/a")
    except Exception as e:
        print(e)
        return

    for each in wealist:
        href = each.get("href")
        if not href:
            # An anchor without href cannot be turned into a province URL.
            continue
        try:
            catch_sub_city(TQ_URL + href, each.text)
        except Exception as e:
            # Report and move on so one broken province page does not stop the run.
            print(e)
 
def catch_sub_city(url, provinceName):
    """Print one JSON object per city linked from a province page.

    Downloads ``url``, selects the city links inside the ``w-province`` list
    and prints each one as ``{"city":"<name>","url":"<absolute url>"},`` on
    its own line. The trailing comma is kept so the lines can be pasted
    between ``[`` and ``]`` as before. Values are serialized with
    ``json.dumps`` so quotes or backslashes in a name cannot break the output.

    Args:
        url: Absolute URL of the province page.
        provinceName: Link text of the province; not used by the body, kept
            for the existing call signature.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    res = requests.get(url, headers=HEADER, timeout=20)
    text = res.content.decode('utf-8', 'ignore')
    if not text.strip():
        # Nothing to parse.
        return
    html = etree.HTML(text)
    if html is None:
        return
    wealist = html.xpath("//div[@class='b1 mb10 oh']/ul[@class='w-province']/li/p/a")
    for each in wealist:
        city = (each.text or "").strip()
        href = each.get("href")
        if not city or not href:
            # Skip anchors that carry no name or no target.
            continue
        record = json.dumps(
            {"city": city, "url": TQ_URL + href},
            ensure_ascii=False,        # keep Chinese names readable
            separators=(",", ":"),     # same compact layout as before
        )
        print(record + ",")
 
# Entry point of the script: the crawl starts as soon as this file is executed
# (it also runs when the file is imported, because the call is unguarded).
catch_province()

Python部分使用Python3,需要用到lxml、requests模块

抓取的结果存储为json(脚本会把每条记录逐行打印到标准输出,需要手动保存到文件,用[]包裹并去掉最后一条记录末尾的逗号),这里只列举部分结果:

1
2
3
4
5
6
7
8
9
10
11
[
{"city":"安庆","url":"http://tianqi.8684.cn/anhui_anqing"},
{"city":"枞阳","url":"http://tianqi.8684.cn/anhui_congyang"},
{"city":"怀宁","url":"http://tianqi.8684.cn/anhui_huaining"},
{"city":"潜山","url":"http://tianqi.8684.cn/anhui_qianshan"},
{"city":"宿松","url":"http://tianqi.8684.cn/anhui_susong"},
{"city":"太湖","url":"http://tianqi.8684.cn/anhui_taihu"},
{"city":"桐城","url":"http://tianqi.8684.cn/anhui_tongcheng"},
{"city":"望江","url":"http://tianqi.8684.cn/anhui_wangjiang"}
....
]

WeatherCity类,城市:查询天气的URL地址:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package com.wjyup.vo;
 
import java.io.Serializable;
 
/**
 * Serializable value object that pairs a city name with the URL used to
 * query that city's weather.
 */
public class WeatherCity implements Serializable {

    private static final long serialVersionUID = 1L;

    /** City name. */
    private String city;

    /** URL of the weather page for the city. */
    private String url;

    /** Creates an instance whose fields are both {@code null}. */
    public WeatherCity() {
        super();
    }

    /** @return the city name, or {@code null} if it was never set */
    public String getCity() {
        return this.city;
    }

    /** @return the weather URL, or {@code null} if it was never set */
    public String getUrl() {
        return this.url;
    }

    /** @param city the city name to store */
    public void setCity(final String city) {
        this.city = city;
    }

    /** @param url the weather URL to store */
    public void setUrl(final String url) {
        this.url = url;
    }

}

Java读取json,根据地名查询天气的Junit测试类:

1
2
3
4
5
6
7
8
9
10
11
package junit;
 
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
 
/**
 * Base class for JUnit 4 tests that need a Spring context: it runs with
 * {@link SpringJUnit4ClassRunner} and loads the context from
 * {@code classpath:applicationContext.xml}. It declares no members itself;
 * test classes extend it to inherit the two annotations.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = "classpath:applicationContext.xml")
public class SpringJunit {
 
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package junit;
 
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import com.alibaba.fastjson.JSONArray;
import com.wjyup.vo.WeatherCity;

import us.codecraft.xsoup.Xsoup;
 
public class ResourcesTest1 extends SpringJunit {
    private HashMap<String, String> weatherCity = new HashMap<>(2586);
    @Test
    public void weatherCityLoadingTest(){
        try {
            InputStream input = ResourcesTest1.class.getResourceAsStream("/weatherCity.json");
            if(input != null && input.available() > 0){
                byte[] b = new byte[input.available()];
                input.read(b);
                input.close();
                String json = new String(b);
                List<WeatherCity> list = JSONArray.parseArray(json, WeatherCity.class);
                long start = System.currentTimeMillis();
                //写入缓存
                for(WeatherCity wc : list){
                    weatherCity.put(wc.getCity(), wc.getUrl());
                }
                long end = System.currentTimeMillis();
                //调用测试
                long start1 = System.currentTimeMillis();
                String result = queryWeatherInfo("北京");
                long end1 = System.currentTimeMillis();
                System.out.println(result);
                System.out.println("添加=耗时:"+(end-start));
                System.out.println("查询=耗时:"+(end1-start1));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
     
    /**
     * 根据城市名称查询天气L
     * @param cityName 城市名称
     * @return 查询的天气信息
     */
    private String queryWeatherInfo(String cityName){
        if(StringUtils.isBlank(cityName)) return null;
        String weather = "未查询到查询[%s]的天气信息!";
        String url = null;
        Iterator<Entry<String, String>> it = weatherCity.entrySet().iterator();
        while(it.hasNext()){
            Entry<String, String> entry = it.next();
            if(entry.getKey().equals(cityName)){
                url = entry.getValue();
                break;
            }
        }
        if(StringUtils.isNotBlank(url)){
            //抓取天气信息
            try {
                Response resp = Jsoup.connect(url)
                        .userAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
                        .timeout(10000)
                        .execute();
                StringBuffer tq = new StringBuffer();
                if(resp.statusCode() == 200){
                    Document doc = resp.parse();
                    List<Element> list = Xsoup.select(doc, "//div[@class='w-forecast mb10']/div").getElements();
                    list = Xsoup.select(list.get(0), "//ul[@class='wf-mod wicon']/li").getElements();
                    Element last = list.get(list.size() - 1);
                    for(Element el : list){
//                        System.out.println(el.text());
                        //日期
                        String date = Xsoup.select(el, "//span/text()").get();
                        //气温
                        String temperature = Xsoup.select(el, "//div/p/text()").get();
                        //其他
                        List<Element> tempList = Xsoup.select(el, "//div/em").getElements();
                        tq.append(date+" "+temperature+" ");
                        for(Element e : tempList){
                            tq.append(e.text()+" ");
                        }
                        if(last != el){
                            tq.append("\n");
                        }
                    }
                    if(tq.length() > 10){
                        weather = cityName+"天气信息如下:\n"+tq.toString();
                    }
                }else{
                    weather = String.format(weather, cityName);
                }
            } catch (IOException e) {
                e.printStackTrace();
                weather = String.format(weather, cityName);
            }
        }else{
            weather = String.format(weather, cityName);
        }
        return weather;
    }
}

Java部分需要用到的jar包,maven配置文件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
<!-- Dependencies used by the Java classes shown in this article.
     NOTE(review): SpringJunit imports org.springframework.test.* but no Spring
     artifact is listed in this snippet; presumably it is declared elsewhere
     in the pom. Verify. -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>
<!-- JSON parsing (JSONArray.parseArray) -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.1.37</version>
</dependency>
<!-- HTTP fetching and HTML parsing -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.1</version>
</dependency>
<!-- XPath selectors on top of jsoup -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>xsoup</artifactId>
    <version>0.3.1</version>
</dependency>
<!-- StringUtils (org.apache.commons.lang3) -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.1</version>
</dependency>
 
<!-- NOTE(review): the Java code shown here only imports
     org.apache.commons.lang3; confirm that commons-lang 2.6 is still needed. -->
<dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
</dependency>