Python crawler: scraping the words of a Shanbay wordbook

I'm not very familiar with Python crawlers and have run into the problem below; I'd be grateful if anyone could point me in the right direction.
I need to scrape the words of a wordbook on the Shanbay vocabulary site. For wordbooks without a chapter directory my script works, but as soon as there is one level of directory it breaks down: I have to open each chapter by hand and feed its URL to the script before that chapter's words can be scraped.
Here is my code:

from bs4 import BeautifulSoup
import re
import requests

file = open("vocabulary.txt", "w")

# capture the word inside <strong>...</strong> tags (letters only)
pattern = '<strong>([a-zA-Z]*?)</strong>'

def spider(url):
    f = requests.get(url)
    soup = BeautifulSoup(f.content, "lxml")
    word_list = soup.select('strong')
    for word in word_list:
        word = re.findall(pattern, str(word))
        if len(word) != 0:  # check the list length first, otherwise word[0] raises IndexError
            print(word[0])
            file.writelines((word[0],"\n"))

# chapter URLs, currently found and entered by hand
url_list = ["https://www.shanbay.com/wordlist/80770/87931/",
            "https://www.shanbay.com/wordlist/80770/89734/"]

unit = 1

for url in url_list:
    file.write("\n#Chapter " + str(unit) + "\n")
    unit += 1
    for i in range(1, 11):  # assumes every chapter has at most 10 pages
        spider(url + "?page=" + str(i))

file.close()

Wordbook URL: https://www.shanbay.com/wordlist/80770
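
Since every chapter link is already listed on the wordbook's index page, the manual step can be removed by scraping that page first and building url_list from it. Below is a minimal sketch; the '.wordbook-wordlist-name > a' selector is borrowed from pardon110's answer further down and may need checking against the site's current markup:

from bs4 import BeautifulSoup
import requests

BASE_URL = "https://www.shanbay.com"

def get_chapter_urls(book_url):
    # fetch the wordbook index page and collect every chapter link on it
    rsp = requests.get(book_url)
    soup = BeautifulSoup(rsp.content, "lxml")
    return [BASE_URL + a["href"] for a in soup.select(".wordbook-wordlist-name > a")]

url_list = get_chapter_urls("https://www.shanbay.com/wordlist/80770/")

With url_list built this way, the paging loop above can stay exactly as it is.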

Replies: 3

pardon110
4 years ago
from bs4 import BeautifulSoup
from collections import OrderedDict
import requests
import math
import sys

# take the wordbook URL from the command line, or fall back to a default book
if len(sys.argv) == 2:
    bookUrl = sys.argv[1]
else:
    bookUrl = 'https://www.shanbay.com/wordbook/172933/'

baseUrl = 'https://www.shanbay.com'
# identify as a regular browser; the header key must be 'User-Agent'
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0'}

def getBookName(bookUrl):
    return bookUrl.split('/')[-2]

def getSoup(url):
    rsp = requests.get(url,headers=head,timeout=2)
    if rsp.status_code == 200:
        return BeautifulSoup(rsp.text, 'lxml')  # name the parser explicitly
    return None

def getChapter(Soup):
    # collect the href of every chapter link on the wordbook index page
    chapterList = []
    for i in Soup.select('.wordbook-wordlist-name>a'):
        chapterList.append(i['href'])
    return chapterList

def mixChapterUrl(baseUrl,chapterList):
    mixChapterList = []
    for i in chapterList:
        mixChapterList.append(baseUrl+i)
    return mixChapterList

def mixChapterPageUrl(mixChapterList):
    mixChapterPageList = []
    for i in  mixChapterList:
        tmpChapterList = []
        soup = getSoup(i)
        number = int(soup.select('#wordlist-num-vocab')[0].string)  # total words in this chapter
        pages = math.ceil(number / 20)  # Shanbay shows 20 words per page
        for j in range(1,pages+1):
            tmpChapterList.append("{}?page={}".format(i,j))
        mixChapterPageList.append(tmpChapterList)
    return mixChapterPageList

def getWord(url):
    soup = getSoup(url)
    word = []
    translate = []
    wordDict = OrderedDict()
    for i in soup.select('.span2>strong'):
        word.append(i.string)
    for i in soup.select('.span10'):
        translate.append((i.string or '').replace('\n', ' '))  # tolerate empty definition cells
    for i in range(len(word)):
        wordDict[word[i]] = translate[i]
    return wordDict

if __name__ == '__main__':

    name = getBookName(bookUrl)
    soup = getSoup(bookUrl)
    chapter = getChapter(soup)
    mixChapterList = mixChapterUrl(baseUrl,chapter)
    word = []
    for i in mixChapterPageUrl(mixChapterList):
        print(len(i))  # progress: number of pages in this chapter
        for j in i:
            word.append(getWord(j))

    with open(name+'.txt','w') as f:
        for i in word:
            for j in i.keys():
                f.write('[%-20s]\t[%s]\n'%(j,i[j]))
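
As a usage note: assuming the script above is saved as shanbay_spider.py (a name chosen here for illustration), it can be pointed at any wordbook from the command line, and falls back to the hardcoded default when no argument is given:

python shanbay_spider.py https://www.shanbay.com/wordbook/80770/

The result is written to a file named after the book id, e.g. 80770.txt, one [word] [definition] pair per line. Note that getBookName takes the second-to-last path segment, so the URL must keep its trailing slash.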
