Scraping Data with BeautifulSoup

Install

sudo apt-get install python3-bs4
pip install beautifulsoup4
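A quick way to confirm the install is to parse a one-line HTML snippet; this sanity check is my own addition and is not part of the original script:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="hello">hi</p>', 'html.parser')
print(soup.p.get_text())   # prints: hi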

Code

import urllib.request
import re
import time
from math import ceil
from bs4 import BeautifulSoup

Scrape a company's detail page, e.g. 'http://b2b.huangye88.com/gongsi/3922373/co...'

def qiyeinfo(picurl):
    """Scrape one company's detail page and append its fields to a text file."""
    time.sleep(1)                        # throttle: one request per second
    qiyeid = picurl.split('/')[-2]       # company id from the URL (currently unused)
    picurl = picurl + 'company_detail.html'
    useragent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                 '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    headers = {'User-Agent': useragent}
    req = urllib.request.Request(picurl, headers=headers)
    html1 = urllib.request.urlopen(req, timeout=5)
    bsObj = BeautifulSoup(html1, 'html.parser', from_encoding='gb18030')
    html1.close()
    try:
        data = bsObj.find('div', {'class': 'data'})   # renamed so it no longer shadows the function name
        tel = bsObj.find('div', {'class': 'telephone'}).get_text()
        qiyename = data.p.get_text()
        contactsname = bsObj.findAll('div', {'class': 'l-content'})[1].a.get_text()
        with open(r'F:\test.txt', 'a+', encoding='utf-8') as f:
            f.write('企业url: ' + picurl + '\n')
            f.write('企业名称:' + qiyename + '\n')
            f.write('联系人:' + contactsname + '\n')
            f.write('手机: ' + tel + '\n')
            for li in data.find('ul').findAll('li'):
                f.write(li.get_text() + '\n')
            f.write('\n')
    except (AttributeError, IndexError):
        # Pages missing any of these fields are skipped silently.
        pass
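For a one-off test, the function can be called directly on a single company URL. The URL below is only a guess built from the company id in the truncated example above; the trailing slash matters because the function appends 'company_detail.html' itself:

# Hypothetical single-company test; substitute a real listing URL from the site.
qiyeinfo('http://b2b.huangye88.com/gongsi/3922373/')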

Scrape the full list of companies

def qiyelist(picurl):
    """Walk the paginated company list and scrape every company found on it."""
    useragent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                 '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    headers = {'User-Agent': useragent}
    req = urllib.request.Request(picurl, headers=headers)
    html = urllib.request.urlopen(req, timeout=10)
    bsObj = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
    html.close()
    listnum = bsObj.find('div', {'class': 'tit tit2'}).em.get_text()   # total number of companies
    a = int(listnum) / len(bsObj.findAll('h4'))                        # pages needed (computed but unused)
    for page in range(15, 25):                                         # hardcoded page window
        listurl = '%s/pn%s' % (picurl, page)
        req = urllib.request.Request(listurl, headers=headers)
        html = urllib.request.urlopen(req, timeout=5)
        bsObj = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
        html.close()
        for item in bsObj.findAll('h4'):                               # one <h4> per company entry
            qiyeurl = item.a.attrs['href']
            qiyeinfo(qiyeurl)
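The page count in a is computed but never used, and range(15, 25) is hardcoded, which is probably why ceil was imported in the first place. Below is a small sketch of how the page range could be derived instead; the helper name and the example numbers are mine, not from the original:

from math import ceil

def page_range(total, per_page):
    """Page numbers 1..N needed to show `total` items, `per_page` at a time."""
    return range(1, ceil(total / per_page) + 1)

# e.g. 230 companies listed 16 per page -> pages 1 through 15
print(list(page_range(230, 16)))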

if __name__ == '__main__':
    qiyelist('http://b2b.huangye88.com/jiangxi/food')
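With a 5-second timeout, a single slow page will raise an exception and stop the whole run. Here is a hedged sketch of a retry wrapper that could replace the two urlopen calls above; the function name, retry count, and backoff are my own assumptions:

import time
import urllib.request

def fetch(url, headers, timeout=5, retries=3):
    """Open url, retrying with a short backoff before giving up."""
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req, timeout=timeout)
        except OSError:                # URLError and socket timeouts both derive from OSError
            time.sleep(2 ** attempt)   # back off 1s, 2s, 4s
    raise RuntimeError('giving up on ' + url)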
