A Crawler a Day: Scraping My learnku Blog Profile Data

Preface

I write one small crawler a day for practice. Today's target is the personal statistics on my learnku blog page.

Code

import re

import requests
from lxml import etree

url = 'https://learnku.com/blog/SilenceHL'
response = requests.get(url).content.decode()

result = etree.HTML(response)
# The display name; r'\w+' strips the surrounding whitespace and newlines.
name = re.search(r'\w+', result.xpath('//div[@class="header"]/a[1]/text()')[0]).group()
# The four statistics boxes: posts, followers, likes, favorites.
post_count = re.search(r'\d+', str(result.xpath('//div[@class="ui four statistics"]/div[1]/div[2]/text()')[0])).group()
fans_count = re.search(r'\d+', str(result.xpath('//div[@class="ui four statistics"]/div[2]/div[2]/text()')[0])).group()
likes_count = re.search(r'\d+', str(result.xpath('//div[@class="ui four statistics"]/div[3]/div[2]/text()')[0])).group()
favorites_count = re.search(r'\d+', str(result.xpath('//div[@class="ui four statistics"]/div[4]/div[2]/text()')[0])).group()
# The two-column grid holds the community rank and total page views.
rank = re.search(r'\d+', str(result.xpath('//div[@class="ui two column grid text-center"]/div[1]//text()')[0])).group()
access = re.search(r'\d+', str(result.xpath('//div[@class="ui two column grid text-center"]/div[2]//text()')[0])).group()
print("{}'s blog: {} posts, {} followers, {} likes, {} favorites, rank {}, {} visits".format(
    name, post_count, fans_count, likes_count, favorites_count, rank, access))
This work is licensed under the CC License; reposts must credit the author and link to this article.