Out of memory while processing data with pandas: Python process gets killed
I wrote a small crawler to collect a dataset with the fields I need. While the script was building the df, the server ran out of memory and killed the process. I then tried del-ing all the non-df variables inside the loop so they would be garbage-collected, but the process still got killed, even though df was only a few MB at that point. What is going wrong here, how do I fix it, and how can I avoid it in the future?
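(One thing worth checking first: `sys.getsizeof(df)` only measures the outer DataFrame object, not the buffers and Python strings it references, so "a few MB" may be a large underestimate. A minimal sketch of a more accurate measurement using the standard pandas `memory_usage` API; the toy frame here is just a stand-in:)

import pandas as pd

df = pd.DataFrame({'名称': ['甲'], '电话': ['123'], '地址': ['某地']})  # stand-in frame
deep_bytes = df.memory_usage(deep=True).sum()  # counts object-dtype string payloads too
print(f"DataFrame actually holds {deep_bytes / 1024 / 1024:.3f} MB")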
# -*- coding: UTF-8 -*-
import time
import requests
import pandas as pd
from lxml import etree
import sys
import gc

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '116.0.0.0 Safari/537.36',
}
df = pd.DataFrame(columns=['名称', '电话', '地址'])
for i in range(1, 4065):
    url = f"https://*********{i}.html#/"
    page_text = requests.get(url=url, headers=headers).content.decode('utf-8')
    tree = etree.HTML(page_text)
    res_name = tree.xpath('//h3//text()')
    res_tel = tree.xpath('/html/body/*****/text()')[1:]
    res_add = tree.xpath('/html/body/****/text()')
    # append one row per scraped record; df.append returns a new, larger DataFrame each call
    for res_num in range(len(res_name)):
        df = df.append({'名称': res_name[res_num], '电话': res_tel[res_num], '地址': res_add[res_num]},
                       ignore_index=True)
    # drop the per-page objects and force a collection
    del url, page_text, tree, res_name, res_tel, res_add
    gc.collect()
    size = sys.getsizeof(df)
    print(f"The size of my_var is {size / 1024 / 1024} MB.")
    print(i)
    time.sleep(0.5)
df.to_excel('./abcd.xlsx', index=False)
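For reference, the pattern that usually causes this: `df.append` returns a brand-new DataFrame on every call, so the loop re-copies all previously collected rows on each iteration (roughly O(n²) allocations over ~4000 pages), and the constant churn of large temporaries can drive the process into the OOM killer even though the final frame is small. `df.append` is also deprecated and was removed in pandas 2.0. A minimal sketch of the usual fix, collecting plain dicts in a list and building the DataFrame once at the end; the redacted URL and XPaths are kept verbatim from the script above, and the added `timeout` is my assumption, not in the original:

# -*- coding: UTF-8 -*-
import time

import pandas as pd
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '116.0.0.0 Safari/537.36',
}

rows = []  # accumulate lightweight dicts; no DataFrame work inside the loop
for i in range(1, 4065):
    url = f"https://*********{i}.html#/"  # redacted URL, as in the original
    page_text = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
    tree = etree.HTML(page_text)
    res_name = tree.xpath('//h3//text()')
    res_tel = tree.xpath('/html/body/*****/text()')[1:]  # redacted XPaths, as in the original
    res_add = tree.xpath('/html/body/****/text()')
    # zip stops at the shortest list, so a page with mismatched counts cannot raise IndexError
    for name, tel, add in zip(res_name, res_tel, res_add):
        rows.append({'名称': name, '电话': tel, '地址': add})
    print(i)
    time.sleep(0.5)

df = pd.DataFrame(rows, columns=['名称', '电话', '地址'])  # one allocation, at the end
print(f"{df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
df.to_excel('./abcd.xlsx', index=False)

With this shape there is also nothing for `del`/`gc.collect()` to clean up between pages; the per-page objects simply fall out of scope when the next iteration rebinds them.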