Dianping (大众点评) spider
# encoding=utf-8
import re
import urllib2
from math import radians, atan, tan, sin, cos, acos
from bs4 import BeautifulSoup
# Request headers
header = {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
# {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
# Sample brand names for testing
brand_name = [u"星巴克", u"麦当劳", u"肯德基", u"小南国", u"真功夫", u"汉拿山", u"新辣道", u"新疆火宴山", u"味千拉面"]
class ShopBindHandle(object):
    def __init__(self, base_page_url="http://www.dianping.com/search/keyword/{}/0_{}/p{}",
                 city=u"北京", wifi_id=None, brand_name=None):
        self.base_page_url = base_page_url
        self.city = city
        self.wifi_id = wifi_id
        self.city_code = self.get_city_code()
        self.brand_name = brand_name
    # Approximate ground distance (km) between two points given as dicts,
    # e.g. {"longitude": 121.47, "latitude": 31.23} and {"longitude": 121.50, "latitude": 31.22}.
    def calc_distance(self, geo1, geo2):
        Lat_A, Lng_A = geo1.get("latitude"), geo1.get("longitude")
        Lat_B, Lng_B = geo2.get("latitude"), geo2.get("longitude")
        ra = 6378.140  # equatorial radius (km)
        rb = 6356.755  # polar radius (km)
        flatten = (ra - rb) / ra  # flattening of the Earth
        rad_lat_A = radians(Lat_A)
        rad_lng_A = radians(Lng_A)
        rad_lat_B = radians(Lat_B)
        rad_lng_B = radians(Lng_B)
        pA = atan(rb / ra * tan(rad_lat_A))
        pB = atan(rb / ra * tan(rad_lat_B))
        # xx is the central angle; identical points give xx = 0 and a zero division below
        xx = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(rad_lng_A - rad_lng_B))
        c1 = (sin(xx) - xx) * (sin(pA) + sin(pB)) ** 2 / cos(xx / 2) ** 2
        c2 = (sin(xx) + xx) * (sin(pA) - sin(pB)) ** 2 / sin(xx / 2) ** 2
        dr = flatten / 8 * (c1 - c2)
        distance = ra * (xx + dr)
        return distance
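    # Usage sketch (illustrative coordinates, two points roughly 3 km apart in Shanghai):
    #
    #     handler = ShopBindHandle()
    #     km = handler.calc_distance({"longitude": 121.47, "latitude": 31.23},
    #                                {"longitude": 121.50, "latitude": 31.22})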
    # Map a city name to its Dianping city code, e.g. (Shanghai, 1).
    def get_city_code(self):
        pass  # TODO: not implemented; see the sketch below
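    # A minimal sketch of get_city_code, assuming a hard-coded table: Shanghai=1
    # comes from the comment above, Beijing=2 is an unverified assumption. A full
    # version would scrape Dianping's city list instead.
    #
    #     def get_city_code(self):
    #         codes = {u"上海": 1, u"北京": 2}  # assumed mapping
    #         return codes.get(self.city)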
    # Yield the URL of every shop on Dianping whose name matches brand_name.
    def get_brand_urls(self, brand_name):
        try:
            for i in xrange(1, 1000):  # walk the paginated search results
                # URL-encode the UTF-8 keyword so non-ASCII brand names work
                page_url = self.base_page_url.format(self.city_code, urllib2.quote(brand_name.encode("utf-8")), i)
                res = urllib2.urlopen(urllib2.Request(page_url, None, header), timeout=10)
                for brand_shop_url in BeautifulSoup(res).findAll("a", attrs={"data-hippo-type": "shop"}):
                    if brand_name in brand_shop_url.get_text():
                        yield "http://www.dianping.com" + brand_shop_url.get("href")
        except Exception:
            # Stop on the first failed page (e.g. past the last page, or a timeout).
            return
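    # Usage sketch (brand name taken from the test list at the top of the file):
    #
    #     for url in handler.get_brand_urls(u"星巴克"):
    #         print url  # e.g. http://www.dianping.com/shop/22680632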
    # Get the shop id and geo coordinates from a Dianping shop page.
    def get_shop_id(self, shop_url):
        soup = BeautifulSoup(urllib2.urlopen(shop_url))
        _geo_text = soup.select_one("#aside script").get_text()
        _p = re.compile(r"(lng:.*,lat:.*)}").findall(_geo_text)[0]  # e.g. u'lng:121.47835,lat:31.22062'
        # cast the coordinates to float so calc_distance can use them directly
        return {"shop_id": shop_url.split("/")[-1],
                "geo": {"longitude": float(_p.split(",")[0].split(":")[-1]),
                        "latitude": float(_p.split(",")[1].split(":")[-1])}}
    # e.g. self.get_shop_id("http://www.dianping.com/shop/22680632")
    # Yield {"shop_id": ..., "geo": ...} for every shop matching brand_name.
    def get_all_shops(self, brand_name):
        for brand_url in self.get_brand_urls(brand_name):
            if not brand_url:
                continue
            yield self.get_shop_id(brand_url)
    # Bind the wifi hotspot to the single nearest shop and return (shop_id, wifi_id).
    def get_bind(self, brand_name, geo):
        dis0 = None
        shop0 = None
        for shop in self.get_all_shops(brand_name):
            dis = self.calc_distance(shop["geo"], geo)
            if dis0 is None or dis < dis0:  # keep the closest shop, not the farthest
                shop0 = shop
                dis0 = dis
        return shop0["shop_id"], self.wifi_id
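# End-to-end usage sketch (assumes get_city_code is implemented; the wifi id and
# coordinates below are hypothetical):
#
#     handler = ShopBindHandle(city=u"北京", wifi_id="wifi-001")
#     print handler.get_bind(u"星巴克", {"longitude": 116.40, "latitude": 39.90})
#     # -> (shop id of the nearest Starbucks, "wifi-001")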
# Fetch a shop page and extract its basic profile fields into a dict.
def get_shop_info(shop_url):
    res = urllib2.urlopen(urllib2.Request(shop_url, None, header))
    soup = BeautifulSoup(res.read())
    basic_info = soup.find(id="basic-info")
    brand_name = basic_info.h1.get_text().strip().split("\n")[0]
    brief_info = basic_info.select(".brief-info span")
    star = brief_info[0].attrs.get("title")  # star rating (title attribute)
    judge = brief_info[1].get_text()         # number of reviews
    avg = brief_info[2].get_text()           # average price
    taste = brief_info[3].get_text()         # taste score
    env = brief_info[4].get_text()           # environment score
    serve = brief_info[5].get_text()         # service score
    address = basic_info.select(".expand-info.address")[0].get_text().replace("\n", "").replace(" ", "")
    tel = basic_info.select(".expand-info.tel")[0].get_text().replace("\n", "")
    other = basic_info.select(".other .info")  # the "other J-other Hide" block
    brand_alias = other[0].get_text().replace("\n", "")
    working_time = "".join(other[1].get_text().strip().split("\n")[0:-1]).replace(" ", "")
    geo_text = soup.find(id="aside").find("script").get_text()
    p = re.compile(r"(lng:.*,lat:.*)}").findall(geo_text)  # e.g. [u'lng:121.47835,lat:31.22062']
    geo = dict([(x.split(":")[0], x.split(":")[-1]) for x in p[0].split(",")])  # {u'lat': u'31.22062', u'lng': u'121.47835'}
    # speciality (recommended dishes) lives in "#shop-tabs script"; left as exploration:
    # speciality = soup.select_one("#shop-tabs script")
    return {"brand_name": brand_name, "star": star, "judge": judge, "avg": avg,
            "taste": taste, "env": env, "serve": serve, "address": address, "tel": tel,
            "brand_alias": brand_alias, "working_time": working_time, "geo": geo}
# get_shop_info("http://www.dianping.com/shop/24730133")
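# A call like the one above returns a dict of the form (values depend on the page):
#     {"brand_name": ..., "star": ..., "address": ..., "tel": ...,
#      "geo": {u"lng": u"121.47835", u"lat": u"31.22062"}, ...}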
# Quick smoke test: print every shop link on one search-result page.
def get_url():
    test_url = "http://www.dianping.com/search/keyword/1/0_星巴克/p100"
    res = urllib2.urlopen(test_url)
    soup = BeautifulSoup(res.read())
    links = soup.findAll("a", attrs={"data-hippo-type": "shop"})
    print links
# print get_url()