博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
python 抓取alexa数据
阅读量:6278 次
发布时间:2019-06-22

本文共 5683 字,大约阅读时间需要 18 分钟。

  要抓取 http://www.alexa.cn/rank/baidu.com 页面上的网站排名信息,例如以下信息:

  

   需要微信扫描登录

    因为这个网站抓取数据是收费的,所以改用网站提供的 API 服务来获取 JSON 数据:

  

  

  

  上面的 API KEY 需要花钱购买(注意:开通会员的方式不行,必须按“10000 次查询 49.00 元”这种方式购买,比较坑爹)。

 

  具体python代码

  

# coding=utf-8
"""Fetch Alexa ranking data for a domain and export it to Excel workbooks.

The data comes from the alexa.cn JSON "details" API. Three workbooks are
written to the current working directory: traffic, per-country and
per-subdomain statistics.
"""
import datetime
import json
import logging
import os
import time
from urllib.parse import urlencode

import httplib2
import xlrd
import xlwt

logger = logging.getLogger(__name__)

API_URL = "http://api.alexa.cn/alexa/details"

# xlwt only produces the legacy binary .xls format; saving that content under
# an .xlsx name yields a file Excel refuses to open, so use the matching
# extension.
WORKBOOK_EXTENSION = ".xls"


class alexa:
    """Client for the alexa.cn details API that saves results as Excel files.

    The lowercase class name and the mixed-case method names are kept as-is
    for backward compatibility with existing callers.
    """

    # NOTE(review): a real API key is hard-coded as the default below. It is
    # kept so existing callers keep working, but the key should be passed in
    # explicitly (e.g. read from an environment variable) instead of living
    # in source control.
    def __init__(self, key="7Z4ddd6ywaQuo6RkKfI3SzGeKn8Mavde"):
        """Store the API key used for every request.

        Args:
            key: API key sent as the ``key`` query parameter.
        """
        self.key = key

    def WriteLog(self, message, date):
        """Append ``message`` to ``alexa/<date>.txt`` under the working directory.

        Args:
            message: Text to append; no newline is added.
            date: String used as the log file's base name.
        """
        fileName = os.path.join(os.getcwd(), 'alexa/' + date + '.txt')
        # The log directory is not guaranteed to exist on first use.
        os.makedirs(os.path.dirname(fileName), exist_ok=True)
        # Explicit UTF-8 so non-ASCII messages do not depend on the locale.
        with open(fileName, 'a', encoding='utf-8') as f:
            f.write(message)

    def WriteSheetRow(self, sheet, rowValueList, rowIndex, isBold):
        """Write one row of values into ``sheet`` starting at column 0.

        Args:
            sheet: xlwt worksheet to write into.
            rowValueList: Iterable of cell values, one per column.
            rowIndex: Zero-based index of the row to write.
            isBold: When true, cells get a yellow background and bold font
                (used for the header row).
        """
        if isBold:
            # Yellow cell background with a bold font.
            header_style = xlwt.easyxf(
                'pattern: pattern solid, fore_colour yellow; font: bold on;')
            for col, value in enumerate(rowValueList):
                sheet.write(rowIndex, col, value, header_style)
        else:
            for col, value in enumerate(rowValueList):
                sheet.write(rowIndex, col, value)

    def save_Excel(self, headList, valuelist, fileName):
        """Create a one-sheet workbook with a header row and save it.

        Args:
            headList: Column titles written (highlighted) in row 0.
            valuelist: Iterable of rows; each row is an iterable of values.
            fileName: Path the workbook is saved to.
        """
        wbk = xlwt.Workbook()
        sheet = wbk.add_sheet('sheet1', cell_overwrite_ok=True)
        self.WriteSheetRow(sheet, headList, 0, True)
        for rowIndex, row in enumerate(valuelist, start=1):
            self.WriteSheetRow(sheet, row, rowIndex, False)
        wbk.save(fileName)

    def getAlexaData(self, domain):
        """Download the Alexa details for ``domain`` and export them.

        This is best-effort: any failure (network, JSON decoding, missing
        keys, file writing) is logged with its traceback instead of being
        raised, so the caller is never interrupted.

        Args:
            domain: Site to query, e.g. ``"baidu.com"``.
        """
        # urlencode escapes characters that would otherwise corrupt the query.
        query = urlencode([("site", domain), ("key", self.key)])
        url = "%s?%s" % (API_URL, query)
        try:
            h = httplib2.Http(".cache")
            (resp_headers, content) = h.request(url, "GET")
            data = json.loads(content.decode('utf8'))
            self.parserData(data)
        except Exception:
            # The URL is deliberately not logged because it contains the key.
            logger.exception(
                "Failed to fetch or export Alexa data for %s", domain)

    def parserData(self, data):
        """Split the API response into three tables and save each as a workbook.

        Files are written to the current working directory as
        ``<YYYY-MM-DD>_traffic``, ``_country`` and ``_subdomains`` workbooks.

        Args:
            data: Decoded API response; ``data["result"]`` must contain
                ``traffic_data``, ``country_data`` and ``subdomains_data``.

        Raises:
            KeyError: If an expected key is missing from ``data``.
        """
        result = data["result"]
        # Computed once so all three files share the same date prefix even
        # when the export runs across midnight.
        today = datetime.datetime.now().strftime('%Y-%m-%d')

        traffic = result["traffic_data"]
        periods = [
            ("当日", traffic["day"]),
            ("周平均", traffic["week"]),
            ("月平均", traffic["month"]),
            ("三月平均", traffic["three_month"]),
        ]
        traffic_fields = [
            "traffic_rank", "traffic_rank_delta",
            "avg_daily_uv", "avg_daily_pv",
        ]
        # The first column holds the period label rather than the API's
        # own "time_range" value.
        traffic_rows = [
            [label] + [period[field] for field in traffic_fields]
            for label, period in periods
        ]
        self._export_table(
            today, "traffic",
            ['周期', '全球网站排名', '变化趋势', '日均UV', '日均PV'],
            traffic_rows)

        country_rows = self._rows_from_records(
            result["country_data"],
            ["country", "code", "rank", "per_users", "per_pageviews"])
        self._export_table(
            today, "country",
            ['国家/地区名称', '国家/地区代码', '国家/地区排名', '网站访问比例', '页面浏览比例'],
            country_rows)

        subdomain_rows = self._rows_from_records(
            result["subdomains_data"],
            ["subdomain", "reach_percentage",
             "pageviews_percentage", "pageviews_peruser"])
        self._export_table(
            today, "subdomains",
            ['被访问网址', '近月网站访问比例', '近月页面访问比例', '人均页面浏览量'],
            subdomain_rows)

    def _rows_from_records(self, records, fields):
        """Return one list per record holding the values of ``fields`` in order."""
        return [[record[field] for field in fields] for record in records]

    def _export_table(self, day, suffix, headList, rows):
        """Save ``rows`` under ``headList`` as ``<day>_<suffix>`` in the cwd."""
        fileName = os.path.join(
            os.getcwd(), "%s_%s%s" % (day, suffix, WORKBOOK_EXTENSION))
        self.save_Excel(headList, rows, fileName)


if __name__ == "__main__":
    obj = alexa()
    obj.getAlexaData("baidu.com")
View Code

 

  

 

转载地址:http://ejgpa.baihongyu.com/

你可能感兴趣的文章
C++类设计的一些心得
查看>>
tableVIew删除时的delete按钮被挡住时重写的方法
查看>>
读cookie中文字符乱码问题
查看>>
招募译者翻译并发数据结构
查看>>
普通表转换为分区表
查看>>
Java 容器 & 泛型:三、HashSet,TreeSet 和 LinkedHashSet比较
查看>>
性能优化总结(六):预加载、聚合SQL应用实例
查看>>
Drill官网文档翻译四 Drill的性能
查看>>
一步一步教你用PHP+MySql搭建网站 No.1 主页&数据库连接
查看>>
JAVA网络编程之Socket
查看>>
翻翻git之---偏向iOS风格的Switch ToggleSwitch
查看>>
Python 全栈开发 -- 开发环境篇
查看>>
python dict type like json
查看>>
颠覆大数据分析之Spark VS分布式共享内存系统
查看>>
深入理解 Android 控件
查看>>
安卓版手机app登录后在后台运行固定时间和被杀死后固定时间重启后重新登录...
查看>>
手把手教你用Hexo+Github 搭建属于自己的博客
查看>>
http缓存知识
查看>>
Go 时间交并集小工具
查看>>
iOS 多线程总结
查看>>