Just do your best!!!

Python+Java简单抓取天气信息

背景:

先使用Python抓取网站上各省份下各城市的天气url地址,并整理成json文件;
再使用Java读取该json文件,根据地名获取到查询天气的url地址,访问该url地址并抓取天气信息。
此处用到的网站地址:http://tianqi.8684.cn

Python抓取网站的各省份天气部分:

import re
import json
import requests
import random
from lxml import etree


# Pool of User-Agent strings; the request headers below pick one at random.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]

# Headers sent with every request made by this script.
# The User-Agent is drawn once, when this assignment runs, and then reused.
HEADER = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
}

# Site root; the province and city hrefs found on the pages are appended to it.
TQ_URL = "http://tianqi.8684.cn"


def catch_province():
    """Crawl the site index and print the weather URL of every city.

    Fetches ``TQ_URL``, takes the province links from the block matched by
    ``//div[@class='b1 mb10'][2]`` and passes each province page to
    ``catch_sub_city``, which prints the city records.

    Errors are reported with ``print`` and never propagate: a failure on the
    index page ends the crawl, a failure on one province page only skips
    that province.
    """
    try:
        res = requests.get(TQ_URL, headers=HEADER, timeout=20)
        # Do not parse an HTTP error page as if it were the index.
        res.raise_for_status()
        html = etree.HTML(res.content.decode('utf-8', 'ignore'))
        blocks = html.xpath("//div[@class='b1 mb10'][2]") if html is not None else []
        if not blocks:
            print("province list not found at {}".format(TQ_URL))
            return
        # The leading "." keeps the query inside the selected block; a bare
        # "//" is an absolute path and would search the whole document again.
        province_links = blocks[0].xpath(".//div[@class='p-sort']/a")
    except Exception as e:
        print(e)
        return

    for link in province_links:
        href = link.get("href")
        if not href:
            # An anchor without a target cannot be followed.
            continue
        try:
            catch_sub_city(TQ_URL + href, link.text)
        except Exception as e:
            # Best effort: one broken province page must not stop the crawl.
            print(e)

def catch_sub_city(url, provinceName):
    """Print one JSON object line for every city listed on a province page.

    Each printed line has the form ``{"city":"<name>","url":"<url>"},`` where
    the URL is ``TQ_URL`` followed by the link's href. The trailing comma is
    kept so the lines can be pasted between ``[`` and ``]``.

    Args:
        url: Address of the province page to fetch.
        provinceName: Name of the province; not used by this function.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    res = requests.get(url, headers=HEADER, timeout=20)
    # Do not parse an HTTP error page as if it were a city list.
    res.raise_for_status()
    html = etree.HTML(res.content.decode('utf-8', 'ignore'))
    if html is None:
        return
    for link in html.xpath("//div[@class='b1 mb10 oh']/ul[@class='w-province']/li/p/a"):
        href = link.get("href")
        if not link.text or not href:
            # Skip anchors that would otherwise be emitted as "None".
            continue
        record = {"city": link.text, "url": TQ_URL + href}
        # json.dumps escapes quotes and backslashes, which manual string
        # formatting did not; compact separators keep the output format.
        print(json.dumps(record, ensure_ascii=False, separators=(",", ":")) + ",")

# Start the crawl; this statement runs whenever the file is executed or imported.
catch_province()

Python部分使用Python3,需要用到lxml、requests模块

脚本会把每个城市以一行json对象的形式打印到标准输出(每行末尾带逗号)。把输出保存为weatherCity.json,去掉最后一行末尾的逗号,并在首尾分别加上[和],即可得到json数组。这里只列举部分结果:

[
{"city":"安庆","url":"http://tianqi.8684.cn/anhui_anqing"},
{"city":"枞阳","url":"http://tianqi.8684.cn/anhui_congyang"},
{"city":"怀宁","url":"http://tianqi.8684.cn/anhui_huaining"},
{"city":"潜山","url":"http://tianqi.8684.cn/anhui_qianshan"},
{"city":"宿松","url":"http://tianqi.8684.cn/anhui_susong"},
{"city":"太湖","url":"http://tianqi.8684.cn/anhui_taihu"},
{"city":"桐城","url":"http://tianqi.8684.cn/anhui_tongcheng"},
{"city":"望江","url":"http://tianqi.8684.cn/anhui_wangjiang"}
....
]

WeatherCity类,保存城市名称及其对应的查询天气的URL地址:

package com.wjyup.vo;

import java.io.Serializable;

/**
 * Serializable bean pairing a city name with the URL used to query its weather.
 */
public class WeatherCity implements Serializable {

    private static final long serialVersionUID = 1L;

    /** City name. */
    private String city;

    /** URL used to query the weather. */
    private String url;

    /** Creates an instance with both fields unset. */
    public WeatherCity() {
        super();
    }

    /** @return the city name */
    public String getCity() {
        return this.city;
    }

    /** @return the URL used to query the weather */
    public String getUrl() {
        return this.url;
    }

    /** @param city the city name to store */
    public void setCity(String city) {
        this.city = city;
    }

    /** @param url the weather query URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

}

Java读取json,根据地名查询天气的Junit测试类:

package junit;

import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

/**
 * Base class for tests: runs them with {@link SpringJUnit4ClassRunner} and
 * loads the context from {@code classpath:applicationContext.xml}.
 * It declares no members of its own.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = "classpath:applicationContext.xml")
public class SpringJunit {

}
package junit;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import com.alibaba.fastjson.JSONArray;
import com.wjyup.vo.WeatherCity;

import us.codecraft.xsoup.Xsoup;

/**
 * Loads the city-to-URL index from {@code /weatherCity.json} on the classpath
 * and looks up the weather forecast of one city by scraping its page.
 */
public class ResourcesTest1 extends SpringJunit {
    /** Classpath location of the JSON array of {@link WeatherCity} entries. */
    private static final String CITY_INDEX_RESOURCE = "/weatherCity.json";

    /** Message returned when no forecast was obtained; {@code %s} is the city name. */
    private static final String NOT_FOUND_TEMPLATE = "未查询到查询[%s]的天气信息!";

    /** Cache: city name -> weather query URL. */
    private HashMap<String, String> weatherCity = new HashMap<>(2586);

    /**
     * Fills the cache from the JSON resource, queries the weather for one
     * city and prints the result together with both elapsed times.
     * Does nothing when the resource is missing or empty; exceptions are
     * printed, not rethrown.
     */
    @Test
    public void weatherCityLoadingTest(){
        try {
            String json = readResourceAsUtf8(CITY_INDEX_RESOURCE);
            if(json == null || json.isEmpty()){
                return;
            }
            List<WeatherCity> list = JSONArray.parseArray(json, WeatherCity.class);
            if(list == null){
                return;
            }
            long start = System.currentTimeMillis();
            // fill the cache
            for(WeatherCity wc : list){
                weatherCity.put(wc.getCity(), wc.getUrl());
            }
            long end = System.currentTimeMillis();
            // run the lookup
            long start1 = System.currentTimeMillis();
            String result = queryWeatherInfo("北京");
            long end1 = System.currentTimeMillis();
            System.out.println(result);
            System.out.println("添加=耗时:"+(end-start));
            System.out.println("查询=耗时:"+(end1-start1));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a classpath resource completely and decodes it as UTF-8.
     *
     * @param resource resource path as accepted by {@link Class#getResourceAsStream(String)}
     * @return the resource content, or {@code null} when the resource does not exist
     * @throws IOException if reading fails
     */
    private static String readResourceAsUtf8(String resource) throws IOException {
        // try-with-resources closes the stream even when reading fails.
        try (InputStream input = ResourcesTest1.class.getResourceAsStream(resource)) {
            if(input == null){
                return null;
            }
            // available() is only an estimate and a single read() may return
            // fewer bytes than requested, so read until end of stream.
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int read;
            while((read = input.read(chunk)) != -1){
                buffer.write(chunk, 0, read);
            }
            // Decode explicitly: the platform default charset is not
            // guaranteed to be UTF-8 and would garble the Chinese city names.
            return new String(buffer.toByteArray(), StandardCharsets.UTF_8);
        }
    }

    /**
     * Queries the weather for a city by name.
     *
     * @param cityName city name, used as the key into the cache
     * @return the forecast text; the "not found" message when the city is
     *         unknown, the request fails or no forecast could be parsed;
     *         {@code null} when {@code cityName} is blank
     */
    private String queryWeatherInfo(String cityName){
        if(StringUtils.isBlank(cityName)) return null;
        String notFound = String.format(NOT_FOUND_TEMPLATE, cityName);
        // Direct hash lookup instead of scanning every entry of the map.
        String url = weatherCity.get(cityName);
        if(StringUtils.isBlank(url)){
            return notFound;
        }
        try {
            Response resp = Jsoup.connect(url)
                    .userAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
                    .timeout(10000)
                    .execute();
            if(resp.statusCode() != 200){
                return notFound;
            }
            String forecast = extractForecast(resp.parse());
            // Same threshold as before: anything of 10 characters or fewer is
            // treated as "nothing parsed". The formatted message is returned
            // in that case instead of the raw template with a literal %s.
            if(forecast.length() > 10){
                return cityName+"天气信息如下:\n"+forecast;
            }
            return notFound;
        } catch (IOException e) {
            e.printStackTrace();
            return notFound;
        }
    }

    /**
     * Extracts the forecast lines (date, temperature, remaining details)
     * from a weather page.
     *
     * @param doc parsed weather page
     * @return one line per forecast entry joined with {@code \n}, or an empty
     *         string when the expected elements are not present
     */
    private static String extractForecast(Document doc){
        List<Element> containers = Xsoup.select(doc, "//div[@class='w-forecast mb10']/div").getElements();
        if(containers.isEmpty()){
            // Page layout not as expected; avoid IndexOutOfBoundsException.
            return "";
        }
        List<Element> days = Xsoup.select(containers.get(0), "//ul[@class='wf-mod wicon']/li").getElements();
        StringBuilder tq = new StringBuilder();
        for(Element el : days){
            if(tq.length() > 0){
                // separator between entries, none after the last one
                tq.append("\n");
            }
            // date
            String date = Xsoup.select(el, "//span/text()").get();
            // temperature
            String temperature = Xsoup.select(el, "//div/p/text()").get();
            // remaining details
            List<Element> tempList = Xsoup.select(el, "//div/em").getElements();
            tq.append(date).append(" ").append(temperature).append(" ");
            for(Element e : tempList){
                tq.append(e.text()).append(" ");
            }
        }
        return tq.toString();
    }
}

Java部分需要用到的jar包,maven配置文件(SpringJunit基类用到了org.springframework.test包,还需要另外引入spring-test及相应的Spring依赖,此处未列出):

    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.1.37</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.1</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>xsoup</artifactId>
        <version>0.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.1</version>
    </dependency>
    
    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
    </dependency>