哒哒哒社区

Python 爬虫:使用 urllib + 正则表达式


  • administrators

    from urllib import  request
    import re
    class Spider():
        """Download a category page and print its entries ordered by 'number'.

        The page at ``url`` is fetched, every ``video-info`` block is parsed
        with regular expressions into a title and a number string, the
        entries are sorted in descending numeric order and printed as
        ``title---number``.  ``start`` is the only public entry point.
        """

        # Page downloaded by ``start``.
        url = 'https://www.panda.tv/cate/dota2'
        # Captures the inner HTML of each ``video-info`` block.
        root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
        # Captures the text between ``</i>`` and the next ``</span>``.
        title_pattern = r'</i>([\s\S]*?)</span>'
        # Captures the text inside the ``video-number`` span.
        number_pattern = r'<span class="video-number">([\s\S]*?)</span>'
        # First decimal number in a number string, e.g. '1.5' in '1.5万'.
        _count_pattern = r'\d+(?:\.\d+)?'

        def __fetch_content(self):
            """Return the body of ``self.url`` decoded as UTF-8 text."""
            # The context manager closes the HTTP response even if read() fails.
            with request.urlopen(self.url) as r:
                htmls = r.read()
            return str(htmls, encoding='utf-8')

        def __analysis(self, htmls):
            """Extract the raw matches for every ``video-info`` block.

            Returns a list of dicts ``{'title': [...], 'number': [...]}`` whose
            values are the (possibly empty) ``re.findall`` results.
            """
            return [
                {
                    'title': re.findall(self.title_pattern, block),
                    'number': re.findall(self.number_pattern, block),
                }
                for block in re.findall(self.root_pattern, htmls)
            ]

        def __refine(self, total):
            """Reduce each raw entry to its first, stripped title and number.

            Entries for which either pattern matched nothing are skipped
            instead of raising ``IndexError``.
            """
            return [
                {
                    'title': item['title'][0].strip(),
                    'number': item['number'][0].strip(),
                }
                for item in total
                if item['title'] and item['number']
            ]

        def __sort(self, refine_total):
            """Return the entries sorted by numeric value, largest first."""
            return sorted(refine_total, key=self.__sort_seed, reverse=True)

        def __sort_seed(self, each_tiem):
            """Convert an entry's number string to a float sort key.

            The decimal part is kept (``'1.5万'`` -> ``15000.0``); a string
            containing '万' is multiplied by 10000.  A string without any
            digits yields ``0.0`` so that one odd entry cannot abort the sort.
            """
            text = each_tiem['number']
            match = re.search(self._count_pattern, text)
            if match is None:
                return 0.0
            number = float(match.group())
            if '万' in text:
                number *= 10000
            return number

        def __show(self, refine_total):
            """Print every entry as ``title---number``."""
            for i in refine_total:
                print(i['title'] + '---' + i['number'])

        def start(self):
            """Fetch the page, parse it, sort the entries and print them."""
            htmls = self.__fetch_content()
            total = self.__analysis(htmls)
            refine_total = list(self.__refine(total))
            refine_total = self.__sort(refine_total)
            self.__show(refine_total)
    
    # Script entry point: create the spider and run it. start() downloads the
    # page, parses it, sorts the entries in descending order and prints them.
    spider = Spider()
    spider.start()
    
    
    

登录后回复
 

社区之外

友情链接

鲁ICP备16014031号-2 ©2017 版权所有 xlelou  

与 哒哒哒 的连接断开,我们正在尝试重连,请耐心等待