- 写这个的时候发现了一个小细节,不加headers的情况下,列表页里面一页的数据是20,加了正常的headers之后数据就和浏览器中浏览到的商品数一致了,为48。虽然页面内商品数量和浏览器观察的不一致,但是分类内商品总数没有变化,也就是页数变多了,为了效率更高一点,还是加上了headers。怀疑是wap页面,但是也没有找到网站的wap站,所以不了了之。
- 另外就是半夜1点多写好了之后,看起来没问题了跑起来了,早上起来发现报错,对方服务器积极拒绝,所以把延时0.5秒去掉了,加了个死循环,这样还靠谱一点。
- sqlite数据库我记得不需要完整路径,但是测试数据库插入的时候,报错为找不到表,改为绝对路径后错误消失,暂不清楚原因。
下面为执行截图
import requests
import csv
import time
import sqlite3
import re
from lxml import etree
class yami:
def __init__(self):
self.site = 'http://www.yamibuy.com/cn/'
self.headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
self.CONN = sqlite3.connect('F:\python\yamibuy\yami.db')
self.CUR = self.CONN.cursor()
self.main_loop()
#主循环
def main_loop(self):
#遍历分类
for cat in self.get_list():
print(cat)
#遍历分页
for page in range(1,555):
url = 'http://www.yamibuy.com/cn/{0}&brands=&brand_name_list=&sort_order=0&sort_by=3&page={1}'.format(cat[3],page)
data = self.list_page(cat,url,page)
if len(data) == 0:
break
self.goods_sql(data)
print(cat[:3], 'done')
#插入数据
def goods_sql(self,data):
for item in data:
sql = 'INSERT INTO "main"."yami_0228" ("name", "price", "url", "cat1", "cat2", "cat3", "brand", "brand_c", "weight", "comment") VALUES (\'{0}\');'.format("','".join(item))
self.CUR.execute(sql)
self.CONN.commit()
#获取列表页
def list_page(self, cat, url, page):
time.sleep(1)
html_str = requests.get(url,headers=self.headers).text
html = etree.HTML(html_str)
goods_list = html.xpath('//*[@id="itemContainer"]/div[1]/div/div/a[3]')
if len(goods_list) == 0:
return []
print('page', page, len(goods_list))
result = []
for goods in goods_list:
goods_url = self.site + goods.get('href')
goods_page = self.goods_page(goods_url)
result.append((
goods.xpath('div/p[1]/text()')[0].replace("'",'-'),
#goods_price
goods.xpath('div/p[2]/text()')[0],
goods_url
) + cat[:3] + goods_page
)
return result
#获取商品详情页
def goods_page(self,goods_url):
# print(goods_url)
# time.sleep(0.5)
flag = True
while flag:
try:
html_str = requests.get(goods_url,headers=self.headers).text
flag = False
except:
pass
html = etree.HTML(html_str)
comment = html.xpath('/html/body/div[1]/div[4]/div/div[2]/div[2]/a[1]/text()')
comment = self.check(comment,'0')
if comment != '0':
comment = re.findall('\d+',comment)[0]
brand = html.xpath('//div[@class="selling-points"]/p[1]/a/text()')
brand = self.check(brand,'')
brand = brand.replace("'",'-')
brand_c = html.xpath('//div[@class="selling-points"]/p[2]/text()')
brand_c = self.check(brand_c,'')
weight = html.xpath('//div[@class="selling-points"]/p[3]/text()')
weight = self.check(weight,'')
weight = weight.replace("'",'-')
return (brand,brand_c,weight,comment)
def check(self,value,preset):
if len(value) == 0:
return preset
else:
return value[0]
#获取三级分类列表
def get_list(self):
html_str = requests.get(self.site).text
html = etree.HTML(html_str)
cat_1_list = html.xpath('/html/body/div[1]/div[1]/div[3]/div/div[2]/div')
for cat_1 in cat_1_list:
cat_1_name = cat_1.xpath('div[1]/h2/text()')[0]
cat_2_list = cat_1.xpath('div[2]/div/div[1]/div')
for cat_2 in cat_2_list:
cat_2_name = cat_2.xpath('p/a/text()')[0]
cat_3_list = cat_2.xpath('ul/li/a')
for cat_3 in cat_3_list:
cat_3_name = cat_3.text
yield (cat_1_name,cat_2_name,cat_3_name,cat_3.get('href'))
if __name__ == '__main__':
yami()
CREATE TABLE "yami_0228" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
"name" TEXT,
"price" TEXT,
"url" TEXT,
"cat1" TEXT,
"cat2" TEXT,
"cat3" TEXT,
"brand" TEXT,
"brand_c" TEXT,
"weight" TEXT,
"comment" TEXT
);
不常做手有点生了