Dianping (大众点评) spider
# encoding=utf-8
import re
import urllib2
from math import radians, atan, tan, sin, cos, acos
from bs4 import BeautifulSoup
# Request headers
header = {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
# {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
# Sample brand names for testing
brand_name = [u"星巴克", u"麦当劳", u"肯德基", u"小南国", u"真功夫", u"汉拿山", u"新辣道", u"新疆火宴山", u"味千拉面"]
class ShopBindHandle(object):
    def __init__(self, base_page_url="http://www.dianping.com/search/keyword/{}/0_{}/p{}",
                 city=u"北京", wifi_id=None, brand_name=None):
        self.base_page_url = base_page_url
        self.city = city
        self.wifi_id = wifi_id
        self.city_code = self.get_city_code()
        self.brand_name = brand_name
    # Approximate ground distance (km) between two points given as dicts,
    # e.g. {"longitude": 121.47, "latitude": 31.23} and {"longitude": 121.50, "latitude": 31.22}.
    def calc_distance(self, geo1, geo2):
        Lat_A, Lng_A = geo1.get("latitude"), geo1.get("longitude")
        Lat_B, Lng_B = geo2.get("latitude"), geo2.get("longitude")
        ra = 6378.140  # equatorial radius (km)
        rb = 6356.755  # polar radius (km)
        flatten = (ra - rb) / ra  # flattening of the Earth
        rad_lat_A = radians(Lat_A)
        rad_lng_A = radians(Lng_A)
        rad_lat_B = radians(Lat_B)
        rad_lng_B = radians(Lng_B)
        pA = atan(rb / ra * tan(rad_lat_A))
        pB = atan(rb / ra * tan(rad_lat_B))
        # xx is the central angle; identical points give xx = 0 and a zero division below
        xx = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(rad_lng_A - rad_lng_B))
        c1 = (sin(xx) - xx) * (sin(pA) + sin(pB)) ** 2 / cos(xx / 2) ** 2
        c2 = (sin(xx) + xx) * (sin(pA) - sin(pB)) ** 2 / sin(xx / 2) ** 2
        dr = flatten / 8 * (c1 - c2)
        distance = ra * (xx + dr)
        return distance
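    # Usage sketch (illustrative coordinates, two points roughly 3 km apart in Shanghai):
    #
    #     handler = ShopBindHandle()
    #     km = handler.calc_distance({"longitude": 121.47, "latitude": 31.23},
    #                                {"longitude": 121.50, "latitude": 31.22})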
    # Map a city name to its Dianping city code, e.g. (Shanghai, 1).
    def get_city_code(self):
        pass  # TODO: not implemented; see the sketch below
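    # A minimal sketch of get_city_code, assuming a hard-coded table: Shanghai=1
    # comes from the comment above, Beijing=2 is an unverified assumption. A full
    # version would scrape Dianping's city list instead.
    #
    #     def get_city_code(self):
    #         codes = {u"上海": 1, u"北京": 2}  # assumed mapping
    #         return codes.get(self.city)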
    # Yield the URL of every shop on Dianping whose name matches brand_name.
    def get_brand_urls(self, brand_name):
        try:
            for i in xrange(1, 1000):  # walk the paginated search results
                # URL-encode the UTF-8 keyword so non-ASCII brand names work
                page_url = self.base_page_url.format(self.city_code, urllib2.quote(brand_name.encode("utf-8")), i)
                res = urllib2.urlopen(urllib2.Request(page_url, None, header), timeout=10)
                for brand_shop_url in BeautifulSoup(res).findAll("a", attrs={"data-hippo-type": "shop"}):
                    if brand_name in brand_shop_url.get_text():
                        yield "http://www.dianping.com" + brand_shop_url.get("href")
        except Exception:
            # Stop on the first failed page (e.g. past the last page, or a timeout).
            return
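    # Usage sketch (brand name taken from the test list at the top of the file):
    #
    #     for url in handler.get_brand_urls(u"星巴克"):
    #         print url  # e.g. http://www.dianping.com/shop/22680632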
    # Get the shop id and geo coordinates from a Dianping shop page.
    def get_shop_id(self, shop_url):
        soup = BeautifulSoup(urllib2.urlopen(shop_url))
        _geo_text = soup.select_one("#aside script").get_text()
        _p = re.compile(r"(lng:.*,lat:.*)}").findall(_geo_text)[0]  # e.g. u'lng:121.47835,lat:31.22062'
        # cast the coordinates to float so calc_distance can use them directly
        return {"shop_id": shop_url.split("/")[-1],
                "geo": {"longitude": float(_p.split(",")[0].split(":")[-1]),
                        "latitude": float(_p.split(",")[1].split(":")[-1])}}
    # e.g. self.get_shop_id("http://www.dianping.com/shop/22680632")
    # Yield {"shop_id": ..., "geo": ...} for every shop matching brand_name.
    def get_all_shops(self, brand_name):
        for brand_url in self.get_brand_urls(brand_name):
            if not brand_url:
                continue
            yield self.get_shop_id(brand_url)
    # Bind the wifi hotspot to the single nearest shop and return (shop_id, wifi_id).
    def get_bind(self, brand_name, geo):
        dis0 = None
        shop0 = None
        for shop in self.get_all_shops(brand_name):
            dis = self.calc_distance(shop["geo"], geo)
            if dis0 is None or dis < dis0:  # keep the closest shop, not the farthest
                shop0 = shop
                dis0 = dis
        return shop0["shop_id"], self.wifi_id
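# End-to-end usage sketch (assumes get_city_code is implemented; the wifi id and
# coordinates below are hypothetical):
#
#     handler = ShopBindHandle(city=u"北京", wifi_id="wifi-001")
#     print handler.get_bind(u"星巴克", {"longitude": 116.40, "latitude": 39.90})
#     # -> (shop id of the nearest Starbucks, "wifi-001")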
# Fetch a shop page and extract its basic profile fields into a dict.
def get_shop_info(shop_url):
    res = urllib2.urlopen(urllib2.Request(shop_url, None, header))
    soup = BeautifulSoup(res.read())
    basic_info = soup.find(id="basic-info")
    brand_name = basic_info.h1.get_text().strip().split("\n")[0]
    brief_info = basic_info.select(".brief-info span")
    star = brief_info[0].attrs.get("title")  # star rating (title attribute)
    judge = brief_info[1].get_text()         # number of reviews
    avg = brief_info[2].get_text()           # average price
    taste = brief_info[3].get_text()         # taste score
    env = brief_info[4].get_text()           # environment score
    serve = brief_info[5].get_text()         # service score
    address = basic_info.select(".expand-info.address")[0].get_text().replace("\n", "").replace(" ", "")
    tel = basic_info.select(".expand-info.tel")[0].get_text().replace("\n", "")
    other = basic_info.select(".other .info")  # the "other J-other Hide" block
    brand_alias = other[0].get_text().replace("\n", "")
    working_time = "".join(other[1].get_text().strip().split("\n")[0:-1]).replace(" ", "")
    geo_text = soup.find(id="aside").find("script").get_text()
    p = re.compile(r"(lng:.*,lat:.*)}").findall(geo_text)  # e.g. [u'lng:121.47835,lat:31.22062']
    geo = dict([(x.split(":")[0], x.split(":")[-1]) for x in p[0].split(",")])  # {u'lat': u'31.22062', u'lng': u'121.47835'}
    # speciality (recommended dishes) lives in "#shop-tabs script"; left as exploration:
    # speciality = soup.select_one("#shop-tabs script")
    return {"brand_name": brand_name, "star": star, "judge": judge, "avg": avg,
            "taste": taste, "env": env, "serve": serve, "address": address, "tel": tel,
            "brand_alias": brand_alias, "working_time": working_time, "geo": geo}
# get_shop_info("http://www.dianping.com/shop/24730133")
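# A call like the one above returns a dict of the form (values depend on the page):
#     {"brand_name": ..., "star": ..., "address": ..., "tel": ...,
#      "geo": {u"lng": u"121.47835", u"lat": u"31.22062"}, ...}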
# Quick smoke test: print every shop link on one search-result page.
def get_url():
    test_url = "http://www.dianping.com/search/keyword/1/0_星巴克/p100"
    res = urllib2.urlopen(test_url)
    soup = BeautifulSoup(res.read())
    links = soup.findAll("a", attrs={"data-hippo-type": "shop"})
    print links
# print get_url()