Python+Java简单抓取天气信息
背景:
使用Python抓取网站上各省份下属城市的天气URL地址,并将结果保存为json文件;再使用Java读取该json文件,
根据地名查找对应的天气URL地址,访问该地址并抓取天气信息。
此处用到的网站地址:http://tianqi.8684.cn
Python抓取网站的各省份天气部分:
import re
import json
import requests
import random
from lxml import etree
# Pool of User-Agent strings; one of them is picked at random for HEADER below.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]

# Request headers passed to every requests.get() call in this script.
# random.choice() runs once, when this statement executes, so the same
# User-Agent is reused for the whole run.
HEADER = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
}

# Base URL of the weather site; the hrefs scraped from its pages are appended
# to it to build the per-province and per-city URLs.
TQ_URL = "http://tianqi.8684.cn"
def catch_province():
    """Fetch the site's front page and crawl every province linked from it.

    The province links are the ``<a>`` elements inside ``div.p-sort`` within
    the second ``div`` whose class is ``b1 mb10``. Each link's href is appended
    to ``TQ_URL`` and handed to ``catch_sub_city`` together with the link text.

    Returns nothing. Errors are printed rather than raised, so the script
    always runs to completion.
    """
    try:
        res = requests.get(TQ_URL, headers=HEADER, timeout=20)
        text = res.content.decode('utf-8', 'ignore')
        if not text.strip():
            print("empty response from {}".format(TQ_URL))
            return
        html = etree.HTML(text)
        sections = html.xpath("//div[@class='b1 mb10'][2]")
        if not sections:
            print("province section not found on {}".format(TQ_URL))
            return
        # The leading "." keeps this query inside the div selected above.
        # A path starting with a bare "//" is evaluated from the document
        # root, even when called on a sub-element, and would ignore it.
        wealist = sections[0].xpath(".//div[@class='p-sort']/a")
        for each in wealist:
            href = each.get("href")
            if not href:
                # An anchor without href cannot be turned into a URL.
                continue
            sub_url = TQ_URL + href
            try:
                catch_sub_city(sub_url, each.text)
            except requests.RequestException as e:
                # One unreachable province must not abort the whole crawl.
                print("{} => {}: {}".format(each.text, sub_url, e))
    except Exception as e:
        # Top-level boundary of the script: report and stop quietly.
        print(e)
def catch_sub_city(url, provinceName):
    """Print one JSON object per city found on a province page.

    Args:
        url: Absolute URL of the province page to download.
        provinceName: Link text of the province. It is not used by this
            function; the parameter is kept so existing callers keep working.

    Each city is printed as ``{"city":"<name>","url":"<TQ_URL + href>"},``
    (note the trailing comma), one per line.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    res = requests.get(url, headers=HEADER, timeout=20)
    text = res.content.decode('utf-8', 'ignore')
    if not text.strip():
        return
    html = etree.HTML(text)
    if html is None:
        return
    wealist = html.xpath("//div[@class='b1 mb10 oh']/ul[@class='w-province']/li/p/a")
    for each in wealist:
        city = each.text
        href = each.get("href")
        if not city or not href:
            # Skip anchors that lack a name or a target instead of emitting
            # "None" as a city or failing on TQ_URL + None.
            continue
        # json.dumps escapes quotes and backslashes that plain string
        # formatting would copy verbatim, producing invalid JSON.
        # ensure_ascii=False keeps the Chinese names readable and the compact
        # separators reproduce the original layout with no spaces.
        entry = json.dumps(
            {"city": city, "url": TQ_URL + href},
            ensure_ascii=False,
            separators=(",", ":"),
        )
        print(entry + ",")
# Top-level call: the crawl starts as soon as this file is executed.
catch_province()
Python部分使用Python3,需要安装lxml、requests模块。
脚本会把每个城市以一行json对象的形式打印到标准输出(每行末尾带逗号);将输出保存为weatherCity.json,在首尾加上[ ]并去掉最后一行末尾的逗号,即得到Java部分读取的json文件。这里只列举部分结果:
[
{"city":"安庆","url":"http://tianqi.8684.cn/anhui_anqing"},
{"city":"枞阳","url":"http://tianqi.8684.cn/anhui_congyang"},
{"city":"怀宁","url":"http://tianqi.8684.cn/anhui_huaining"},
{"city":"潜山","url":"http://tianqi.8684.cn/anhui_qianshan"},
{"city":"宿松","url":"http://tianqi.8684.cn/anhui_susong"},
{"city":"太湖","url":"http://tianqi.8684.cn/anhui_taihu"},
{"city":"桐城","url":"http://tianqi.8684.cn/anhui_tongcheng"},
{"city":"望江","url":"http://tianqi.8684.cn/anhui_wangjiang"}
....
]
WeatherCity类,保存城市名称及其对应的查询天气URL地址:
package com.wjyup.vo;
import java.io.Serializable;
/**
 * Value object pairing a city name with the URL used to query its weather.
 */
public class WeatherCity implements Serializable {

    private static final long serialVersionUID = 1L;

    /** City name. */
    private String city;

    /** URL of the page queried for this city's weather. */
    private String url;

    /**
     * Creates an empty instance; both fields start out as {@code null}.
     */
    public WeatherCity() {
    }

    /**
     * @return the city name
     */
    public String getCity() {
        return this.city;
    }

    /**
     * @param city the city name
     */
    public void setCity(String city) {
        this.city = city;
    }

    /**
     * @return the weather query URL
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param url the weather query URL
     */
    public void setUrl(String url) {
        this.url = url;
    }
}
Java读取json,根据地名查询天气的Junit测试类:
package junit;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
/**
 * Base class for JUnit 4 tests that run under the Spring test runner.
 * It only declares the runner and the context location
 * ("classpath:applicationContext.xml") and has no members of its own.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = "classpath:applicationContext.xml")
public class SpringJunit {
}
package junit;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import com.alibaba.fastjson.JSONArray;
import com.wjyup.vo.WeatherCity;
import us.codecraft.xsoup.Xsoup;
/**
 * Loads the city-to-URL list from the classpath resource
 * {@code /weatherCity.json}, caches it in a map and looks up the weather for
 * one city by scraping the page behind its URL.
 */
public class ResourcesTest1 extends SpringJunit {

    /** City name to weather page URL, filled by {@link #weatherCityLoadingTest()}. */
    private HashMap<String, String> weatherCity = new HashMap<>(2586);

    /**
     * Reads {@code /weatherCity.json}, fills the cache, queries one city and
     * prints the result together with the time spent filling the cache and
     * running the query. Does nothing when the resource is missing or empty.
     */
    @Test
    public void weatherCityLoadingTest() {
        // try-with-resources closes the stream even when reading or parsing fails.
        try (InputStream input = ResourcesTest1.class.getResourceAsStream("/weatherCity.json")) {
            if (input == null) {
                return;
            }
            String json = readUtf8(input);
            if (json.isEmpty()) {
                return;
            }
            List<WeatherCity> list = JSONArray.parseArray(json, WeatherCity.class);
            long start = System.currentTimeMillis();
            // Fill the cache.
            for (WeatherCity wc : list) {
                weatherCity.put(wc.getCity(), wc.getUrl());
            }
            long end = System.currentTimeMillis();
            // Run one query.
            long start1 = System.currentTimeMillis();
            String result = queryWeatherInfo("北京");
            long end1 = System.currentTimeMillis();
            System.out.println(result);
            System.out.println("添加=耗时:" + (end - start));
            System.out.println("查询=耗时:" + (end1 - start1));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole stream and decodes it as UTF-8.
     * <p>
     * {@code available()} followed by a single {@code read()} is not
     * guaranteed to return the full content, so the stream is drained in a
     * loop. The charset is explicit so that Chinese city names do not depend
     * on the platform default encoding (assumes the file is saved as UTF-8).
     *
     * @param in stream to drain; it is not closed here
     * @return the decoded content, possibly empty
     * @throws IOException if reading fails
     */
    private static String readUtf8(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }

    /**
     * Looks up the weather for a city.
     *
     * @param cityName city name; must match a key of the cache exactly
     * @return the scraped weather text, a "not found" message when the city is
     *         unknown or the page yields no usable data, or {@code null} when
     *         {@code cityName} is blank
     */
    private String queryWeatherInfo(String cityName) {
        if (StringUtils.isBlank(cityName)) {
            return null;
        }
        // Format the fallback once so that no path can return the raw "%s" template.
        String notFound = String.format("未查询到查询[%s]的天气信息!", cityName);
        // Direct hash lookup instead of scanning every entry of the map.
        String url = weatherCity.get(cityName);
        if (StringUtils.isBlank(url)) {
            return notFound;
        }
        try {
            Response resp = Jsoup.connect(url)
                    .userAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
                    .timeout(10000)
                    .execute();
            if (resp.statusCode() != 200) {
                return notFound;
            }
            Document doc = resp.parse();
            List<Element> forecastBlocks = Xsoup.select(doc, "//div[@class='w-forecast mb10']/div").getElements();
            if (forecastBlocks.isEmpty()) {
                // Page layout differs from what is expected; report "not found"
                // instead of failing on get(0).
                return notFound;
            }
            List<Element> days = Xsoup.select(forecastBlocks.get(0), "//ul[@class='wf-mod wicon']/li").getElements();
            StringBuilder tq = new StringBuilder();
            for (Element day : days) {
                // Newline between entries, none after the last one.
                if (tq.length() > 0) {
                    tq.append("\n");
                }
                // Date
                String date = Xsoup.select(day, "//span/text()").get();
                // Temperature
                String temperature = Xsoup.select(day, "//div/p/text()").get();
                tq.append(date).append(" ").append(temperature).append(" ");
                // Remaining details
                for (Element extra : Xsoup.select(day, "//div/em").getElements()) {
                    tq.append(extra.text()).append(" ");
                }
            }
            // Same threshold as before: 10 characters or fewer count as no data.
            if (tq.length() > 10) {
                return cityName + "天气信息如下:\n" + tq;
            }
            return notFound;
        } catch (IOException e) {
            e.printStackTrace();
            return notFound;
        }
    }
}
Java部分需要用到的jar包,maven配置文件:
<!-- NOTE(review): the test base class imports org.springframework.test, but no Spring artifact is listed in this fragment; presumably it is declared elsewhere in the pom. Confirm. -->
<!-- JUnit 4: @Test and @RunWith used by the test classes. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- fastjson: JSONArray.parseArray turns weatherCity.json into WeatherCity objects. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.37</version>
</dependency>
<!-- jsoup: downloads and parses the weather page. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.1</version>
</dependency>
<!-- xsoup: XPath-style selection on jsoup documents and elements. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.3.1</version>
</dependency>
<!-- commons-lang3: StringUtils.isBlank / isNotBlank. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
</dependency>
<!-- NOTE(review): the Java code shown with this fragment imports only org.apache.commons.lang3, not org.apache.commons.lang; confirm whether commons-lang 2.6 is still needed. -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>