金融行业也能用到Python?这就是Python之所以能这么火的原因吧!

综合编程 2018-06-20

本文的爬虫程序放在firstSpider文件夹下,按功能分为URL下载器、URL管理器、HTML下载器、HTML解析器、数据储存器和爬虫调度器6大模块,下面对这6大模块进行详细的介绍。

一、URL下载器

URL下载器包含两步,首先下载网站左侧导航栏的URL,然后通过导航栏的URL获取每个子栏目包含的链接列表。

下面是获取左侧导航栏所有链接并生成导航文件的代码

# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
import os
class get_catalog(object):
 '''生成和操作导航文件'''
 def save_catalog(self):
 '''获得证券之星左侧子导航的内容和网址并保存'''
 #获取网页内容
 url = 'http://quote.stockstar.com'
 request =urllib.request.Request(url = url)
 response = urllib.request.urlopen(request)
 content = response.read().decode('gbk')
 #截取左侧导航内容
 soup = BeautifulSoup(content,"lxml")
 soup = BeautifulSoup(str(soup.find_all('div',class_ = "subMenuBox")),"lxml")
 #初始化一级子目录和二级子目录的数据框
 catalog1 = pd.DataFrame(columns = ["cata1","cata2","url2"])
 catalog2 = pd.DataFrame(columns = ["url2","cata3","url3"])
 #整理目录内容和其对应的链接
 index1 = 0;index2 = 0
 for content1 in soup.find_all('div',class_ = re.compile("list submenu?")):
  cata1 = re.findall('>(.*?)<',str(content1.h3)) #该栏目组的一级标题
  for content2 in content1.find_all('dl'):
   cata2 = re.findall('>(.*?)<',str(content2.dt.a).replace('\r\n',''))
   url2 = url + content2.dt.a['href']
   catalog1.loc[index1] = {'cata1':cata1[0],'cata2':cata2[0].split()[0],'url2':url2}
   index1 += 1
   for content3 in content2.find_all('li'):
    cata3 = re.findall('·(.*?)<',str(content3.a))
    url3 = url + content3.a['href']
    catalog2.loc[index2] = {'url2':url2,'cata3':cata3[0],'url3':url3}
    index2 += 1
 #对一级子目录表和二级子目录表做表连接并保存
 catalog = pd.merge(catalog1,catalog2,on='url2',how='left')
 catalog.to_csv('catalog.csv',encoding='gbk') #以gbk编码保存,与load_catalog中的读取方式保持一致
 
 def load_catalog(self):
 '''判断导航文件是否存在并载入'''
 if 'catalog.csv' not in os.listdir():
 self.save_catalog()
 print('网址导航文件已生成')
 else:
 print('网址导航文件已存在')
 catalog = pd.read_csv('catalog.csv',encoding='gbk',usecols=range(1,6))
 print("网址导航文件已载入")
 return(catalog)

 def index_info(self,catalog,index):
 '''创建每行的行名作为存入数据库的表名,并获取每行最终的网址链接'''
 if str(catalog.loc[index]['cata3'])=='nan':
 table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2']
 url = catalog.loc[index]['url2']
 else:
 #+、()等符号不能作为数据库表名,得替换或剔除
 if '+' in catalog.loc[index]['cata3']:
 cata3 = catalog.loc[index]['cata3'].replace('+','')
 table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
 elif '(' in catalog.loc[index]['cata3']:
 cata3 = catalog.loc[index]['cata3'].replace('(','').replace(')','')
 table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
 else:
 table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + catalog.loc[index]['cata3']
 url = catalog.loc[index]['url3']
 return(table_name,url)

get_catalog
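
get_catalog的用法与文末主程序中的调用方式一致:先用load_catalog()载入(或生成)导航文件,再用index_info()取出某一行对应的表名和链接。下面是一个简单的用法示例(假设本模块保存为get_catalog.py):

# get_catalog的简单用法示例(与文末主程序中的调用方式一致)
from get_catalog import get_catalog

getcata = get_catalog()
catalog = getcata.load_catalog() #导航文件不存在时会先抓取并生成catalog.csv
table_name,url = getcata.index_info(catalog,0) #取第0行对应的表名和最终链接
print(table_name,url)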

下面是获取每个子栏目所有链接的代码

import pandas as pd
from selenium import webdriver
import time
import re
import math
from get_catalog import get_catalog
class get_urls(object):
 '''获取每个栏目的链接列表'''
 def __init__(self,browser,url):
 self.browser = browser #浏览器对象
 self.url = url #待爬取的URL
 
 def get_browser(self):
 '''连接URL'''
 state = 0
 test = 0
 while state == 0 and test < 5:
 try:
 self.browser.get(self.url)
 state = 1
 print('成功连接 %s'%self.url)
 except:
 test += 1

 def get_element(self):
 '''获取翻页相关按钮的链接列表'''
 self.get_browser()
 element_list=[]
 for i in range(1,8):
 try: 
 element = self.browser.find_element_by_xpath('//*[@id="divPageControl1"]/a[%d]'%i).get_attribute('href')
 element_list.append(element)
 except:
 time.sleep(0.2)
 return(element_list)

 def get_urllist(self):
 '''通过翻页相关按钮生成有效的页码链接列表'''
 element_list = self.get_element()
 if len(element_list)<=1:
 urls = [self.url]
 else:
 try:
 max_number = re.search('_(\d*)\.',element_list[len(element_list)-3])
 begin = max_number.start() + 1
 end = max_number.end() - 1
 int_max_number = int(element_list[len(element_list)-3][begin:end])
 urls = []
 for i in range(1,int_max_number + 1):
 url = element_list[len(element_list)-3][:begin] + str(i) + element_list[len(element_list)-3][end:]
 urls.append(url)
 except:
 urls = [self.url]
 return(urls)

get_urls

二、URL管理器

URL管理器负责维护未爬取和已爬取的URL集合,避免重复爬取,下面是URL管理器模块的代码

# coding:utf-8
class UrlManager(object):
 '''URL管理器'''
 def __init__(self):
 self.new_urls = set() #未爬取URL集合
 self.old_urls = set() #已爬取URL
 def has_new_url(self):
 '''判断是否有未爬取的URL'''
 return(self.new_url_size()!=0)
 def get_new_url(self):
 '''获取一个未爬取的URL'''
 new_url = self.new_urls.pop()
 self.old_urls.add(new_url)
 return(new_url)
 def add_new_url(self,url):
 '''将新的URL添加到未爬取的URL集合中'''
 if url is None:
 return
 if url not in self.new_urls and url not in self.old_urls:
 self.new_urls.add(url)
 def add_new_urls(self,urls):
 '''将新的URL列表添加到未爬取的URL集合中'''
 if urls is None or len(urls)==0:
 return
 for url in urls:
 self.add_new_url(url)
 def new_url_size(self):
 '''获取未爬取URL集合的大小'''
 return(len(self.new_urls))

UrlManager

三、HTML下载器

HTML下载器用来下载网页,这时需要注意网页的编码,以保证下载的网页没有乱码。
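
对于编码问题,可以先按页面声明的gbk解码、失败时再退回utf-8,下面是一个简单的示意(非本文的模块代码,仅作演示):

# 编码处理示意:证券之星页面为gbk编码,解码失败时退回utf-8
import urllib.request

def decode_html(raw):
    '''先按gbk解码,出现解码异常时再尝试utf-8'''
    try:
        return raw.decode('gbk')
    except UnicodeDecodeError:
        return raw.decode('utf-8',errors='replace')

raw = urllib.request.urlopen('http://quote.stockstar.com').read()
print(decode_html(raw)[:100])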

获取网页内容时可能会遇到IP被封的情况,所以我们得爬取一个代理IP池,供HTML下载器使用。

下面是获取代理IP池的代码

import urllib.request
import re
import time
import random
import socket
import threading

class proxy_ip(object):
 '''获取有效代理IP并保存'''
 def __init__(self,url,total_page):
 self.url = url #打算爬取的网址
 self.total_page = total_page #遍历代理IP网页的页数
 
 def get_proxys(self):
 '''抓取代理IP'''
 user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
 ip_totle=[]
 for page in range(1,self.total_page+1):
 #url = 'http://www.httpsdaili.com/?page='+str(page)
 #url='http://www.kuaidaili.com/free/inha/'+str(page)+'/'
 url='http://www.xicidaili.com/nn/'+str(page) #西刺代理
 headers={"User-Agent":random.choice(user_agent)}
 try:
 request=urllib.request.Request(url=url,headers=headers)
 response=urllib.request.urlopen(request)
 content=response.read().decode('utf-8')
 print('get page',page)
 pattern=re.compile('<td>(\d.*?)</td>') #截取<td>与</td>之间第一个数为数字的内容
 ip_page=re.findall(pattern,str(content))
 ip_totle.extend(ip_page)
 except Exception as e:
 print(e)
 time.sleep(random.choice(range(1,5)))
 #打印抓取内容
 print('代理IP地址','\t','端口','\t','速度','\t','验证时间')
 for i in range(0,len(ip_totle),4):
  print(ip_totle[i],'\t',ip_totle[i+1],'\t',ip_totle[i+2],'\t',ip_totle[i+3])
 #整理代理IP格式
 proxys = []
 for i in range(0,len(ip_totle),4):
 proxy_host = ip_totle[i]+':'+ip_totle[i+1]
 proxy_temp = {"http":proxy_host}
 proxys.append(proxy_temp)
 return(proxys)

 def test(self,lock,proxys,i,f):
 '''验证代理IP有效性'''
 socket.setdefaulttimeout(15) #设置全局超时时间
 url = self.url 
 try:
 proxy_support = urllib.request.ProxyHandler(proxys[i])
 opener = urllib.request.build_opener(proxy_support)
 opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64)")]
 urllib.request.install_opener(opener)
 #res = urllib.request.urlopen(url).read().decode('gbk')
 res = urllib.request.urlopen(url).read().decode('utf-8')
 print(res)
 lock.acquire() #获得锁
 print(proxys[i],'is OK')
 f.write('%s\n' %str(proxys[i])) #写入该代理IP
 lock.release() #释放锁
 except Exception as e:
 lock.acquire()
 print(proxys[i],e)
 lock.release()
 
 def get_ip(self):
 '''多线程验证'''
 f = open('proxy_ip.txt','a+') #新建一个储存有效IP的文档
 lock=threading.Lock() #建立一个锁
 #多线程验证
 proxys = self.get_proxys()
 threads=[]
 for i in range(len(proxys)):
 thread=threading.Thread(target=self.test,args=[lock,proxys,i,f])
 threads.append(thread)
 thread.start()
 #阻塞主进程,等待所有子线程结束
 for thread in threads:
 thread.join() 
 f.close() #关闭文件

get_proxy_ip

下面是HTML下载器模块的代码

# _*_ coding:utf-8 _*_
from firstSpider.get_proxy_ip import proxy_ip
import urllib.request
import random
import os
import socket
import time
import re
class HtmlDownloader(object):
 '''获取网页内容'''
 def download(self,url):
 user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
 state = 0;test = 0
 socket.setdefaulttimeout(20) #设置全局超时时间
 while state == 0 and test < 5:
 try:
 request = urllib.request.Request(url=url,headers={"User-Agent":random.choice(user_agent)})#随机从user_agent列表中抽取一个元素
 response = urllib.request.urlopen(request)
 readhtml = response.read()
 content = readhtml.decode('gbk') #读取网页内容
 time.sleep(random.randrange(1,6))
 if re.search('Auth Result',content) == None:
 state = 1
 except Exception as e:
 print('系统IP获取网页失败','',e)
 if 'proxy_ip.txt' not in os.listdir() or os.path.getsize('proxy_ip.txt') == 0:
 print('代理IP池不存在,新建代理IP池')
 pool = proxy_ip(url,5)
 pool.get_ip()
 print('代理IP池创建完毕')
 else:
 f = open('proxy_ip.txt','r')
 proxys_ip = f.readlines()
 f.close()
 random.shuffle(proxys_ip)
 for i in range(len(proxys_ip)):
 try:
 proxy_support = urllib.request.ProxyHandler(eval(proxys_ip[i][:-1]))
 opener = urllib.request.build_opener(proxy_support)
 opener.addheaders=[("User-Agent",random.choice(user_agent))]
 urllib.request.install_opener(opener)
 response = urllib.request.urlopen(url)
 readhtml = response.read()
 content = readhtml.decode('gbk')
 time.sleep(random.randrange(1,6))
 if re.search('Auth Result',content) == None: #排除被判别为无效用户的情况
 state = 1
 print('成功接入代理IP',proxys_ip[i])
 break
 except urllib.error.HTTPError as e:
  print(proxys_ip[i],'请求失败',e.code)
 except urllib.error.URLError as e:
  print(proxys_ip[i],'请求失败',e.reason)
 except Exception as e:
  print(proxys_ip[i],'请求失败',e)
 try:
 if i == len(proxys_ip)-1:
 os.remove('proxy_ip.txt')
 print('代理IP池失效,已删除')
 except: #i不存在的情况
 os.remove('proxy_ip.txt')
 print('代理IP池为空,文件已删除')
 time.sleep(60)
 test += 1
 if test == 5:
 print('未成功获取 %s 页面内容'%url)
 content = None
 return(content)

HtmlDownloader

四、HTML解析器

HTML解析器主要对HTML下载器下载的网页内容进行解析,提取想要的内容。

本文用到的网页解析方法主要是正则表达式和BeautifulSoup。
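
这种"先用BeautifulSoup定位节点、再用正则取出标签之间文本"的思路,可以用下面的小例子说明(示例中的HTML为虚构数据,仅作演示):

# 解析思路示意:用BeautifulSoup定位到表格行,再用正则取出标签之间的文本
import re
from bs4 import BeautifulSoup

html = '<table><tbody id="datalist"><tr><td>600000</td><td>浦发银行</td></tr></tbody></table>' #虚构的示例数据
soup = BeautifulSoup(html,"lxml")
row = soup.find_all('tr')[0]
detail = re.findall('>(.*?)<',str(row))
print(detail) #输出['', '600000', '', '浦发银行', ''],其中的空字符串在解析器中会被剔除

下面是HTML解析器的代码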

# coding:utf-8
import re
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np
import time
import datetime
class HtmlParser(object):
 '''解析网页内容'''
 def __init__(self,content):
 self.soup = BeautifulSoup(content,"lxml") #待解析内容
 
 def get_header(self):
  '''获取表格标题'''
  try:
   header = []
   for tag in self.soup.thead.find_all('td'):
    title = str(tag)
    title = title.replace(' ','')
    title = title.replace('\n','')
    header.extend(re.findall('>(.*?)<',title))
   header_name = [name for name in header if name != '']
   datalist_len = len(self.soup.find_all('tbody',id=re.compile("datalist")))
   if len(self.soup.thead.find_all('tr')) >= 6 or datalist_len == 0: #排除了标题格式不统一和没数据的两种情况
    header_name = []
  except:
   header_name = []
  return(header_name)

 def get_header2(self):
 '''获取表格标题(标题存在两层)'''
 stati_date = []
 for date in self.soup.thead.find_all('td',class_ = "double align_center"):
 stati_date.extend(re.findall('>(.*?)<',str(date)))
 header_total = self.get_header()
 header_name = header_total[:-5]
 header_name = header_name[:2] + header_total[-5:-1] + header_name[2:]
 if stati_date[0] in header_name:
 header_name.remove(stati_date[0])
 if stati_date[1] in header_name:
 header_name.remove(stati_date[1])
 header_name.append('三四列统计时间')
 header_name.append('五六列统计时间')
 header_name.append('数据时间')
 return(header_name,stati_date)
 
 def get_datatime(self):
  '''获取数据时间'''
  try:
   date = re.findall('数据时间:(.*?)<',str(self.soup))[0]
  except:
   date = time.strftime('%Y-%m-%d') #页面上没有标明数据时间时,以当天日期代替
  return(date)

 def get_datalist(self):
  '''获取表格数据'''
  if len(self.soup.find_all('tbody',id="datalist")) >= 1:
   soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist")[0]),"lxml")
  elif len(self.soup.find_all('tbody',id="datalist1")) >= 1:
   soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist1")[0]),"lxml")
  else:
   soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist2")[0]),"lxml")
 date = self.get_datatime()
 row = len(soup.tbody.find_all('tr'))
 #初始化正常标题和双重标题时的数组
 if len(self.soup.thead.find_all('td',class_ = "double align_center")) == 0:
 header_name = self.get_header()
 col = len(header_name)
 datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
 flag = 1
 else:
 header_name = self.get_header2()[0]
 col = len(header_name)
 datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
 flag = 2 
 for i in range(row): #提取数据并写入数组
 detail = re.findall('>(.*?)<',str(soup.find_all('tr')[i]))
 for blank in range(detail.count('')):
 detail.remove("")
 try:
 if flag == 1:
 detail.append(date)
 datalist[i] = detail
 elif flag == 2:
 stati_date = self.get_header2()[1]
 detail.append(stati_date[0])
 detail.append(stati_date[1])
 detail.append(date)
 datalist[i] = detail 
 except:
 datalist[i][0] = detail[0]
 datalist[i][col-1] = date
 return(datalist,header_name)
 
 def get_dataframe(self):
 '''组合标题和数据为数据框并输出'''
 datalist,header_name = self.get_datalist()
 table = pd.DataFrame(datalist ,columns = header_name)
 return(table)

HtmlParser
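
五、数据储存器

原文的6大模块中还有一个数据储存器(DataOutput),本文节选未附上它的代码。从下文爬虫调度器中 out = DataOutput(self.engine,data,self.table_name) 和 out.output() 的用法看,它的职责是把解析得到的数据框写入数据库,下面给出一个按该用法假设的最小示意(非原文实现):

# 按爬虫调度器中的调用方式假设的数据储存器示意(参数含义来自上文用法,并非原文实现)
class DataOutput(object):
    '''将数据框写入数据库中名为table_name的表'''
    def __init__(self,engine,data,table_name):
        self.engine = engine #sqlalchemy数据库引擎
        self.data = data #HtmlParser输出的数据框
        self.table_name = table_name #表名
    def output(self):
        '''追加写入数据库,表不存在时自动创建'''
        self.data.to_sql(self.table_name,self.engine,if_exists='append',index=False)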

六、爬虫调度器

爬虫调度器主要将上述几个模块组合起来,合理分工,高效地完成爬取任务。

爬虫调度器采用线程池(threadpool)的方式加快程序执行效率,下面是爬虫调度器模块的代码

from firstSpider.UrlManager import UrlManager
from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser
from firstSpider.DataOutput import DataOutput
from sqlalchemy import create_engine
import threadpool,time
 
class SpiderMan(object):
 '''爬虫机器人'''
 def __init__(self,engine,table_name):
 self.engine = engine #数据库连接引擎
 self.table_name = table_name #表名
 self.manager = UrlManager() #URL管理器
 self.downloader = HtmlDownloader() #HTML下载器

 def spider(self,url):
 '''单网页爬虫组件'''
 # HTML下载器下载网页
 html = self.downloader.download(url)
 f = open('stock.txt','w')
 f.write(html)
 f.close()
 # HTML解析器抽取网页数据
 parser = HtmlParser(html)
 if len(parser.get_header()) > 0:
 data = parser.get_dataframe()
 # 数据储存器储存文件
 out = DataOutput(self.engine,data,self.table_name)
 out.output()
 print('%s 的数据已存入表 %s'%(url,self.table_name))
 time.sleep(1)
 return(parser.get_datatime())
 
 def crawl(self,urls):
 '''爬取一个栏目连接列表的内容'''
 self.manager.add_new_urls(urls)
 # 判断url管理器中是否有新的url
 pool = threadpool.ThreadPool(10)
 while(self.manager.has_new_url()):
 # 从URL管理器获取新的url
 new_url = self.manager.get_new_url()
 requests = threadpool.makeRequests(self.spider,(new_url,))
 pool.putRequest(requests[0])
 pool.wait()

SpiderMan

将上述每个模块的代码都新建一个py文件放在firstSpider文件夹下,并运行如下主程序即可获取证券之星全站的股票数据

from firstSpider.get_proxy_ip import proxy_ip
from firstSpider.get_catalog import get_catalog
from firstSpider.get_urls import get_urls
from firstSpider.SpiderMan import SpiderMan
from selenium import webdriver
from sqlalchemy import create_engine
import time

'''根据左侧子导航下载证券之星当天所有数据'''
if __name__ == "__main__":
 print('获取代理IP并验证有效性')
 ip_pool = proxy_ip('http://quote.stockstar.com',8)
 ip_pool.get_ip()
 print('代理IP池建立完毕')
 getcata = get_catalog()
 catalog = getcata.load_catalog()
 start = 0
 end = len(catalog)
 catalog = catalog[start : end]
 print('初始化浏览器')
 browser = webdriver.Chrome()
 engine = create_engine('mysql+pymysql://root:Jwd116875@localhost:3306/scott?charset=utf8')
 for index in range(start,end):
 table_name,url = getcata.index_info(catalog,index)
 stop_url = ['http://quote.stockstar.com/gold/globalcurrency.shtml'] #想过滤掉的网页链接
 if url not in stop_url:
 geturls = get_urls(browser,url)
 urls = geturls.get_urllist()
 print('已获取 %s 的链接列表'%table_name)
 Spider_man = SpiderMan(engine,table_name)
 Spider_man.crawl(urls)
 datatime = Spider_man.spider(urls[0])
 print('%s: %s 栏目 %s 的增量数据爬取完毕'%(index,table_name,datatime))

main

麻雀虽小,五脏俱全。以上是用简单的爬虫框架实现的一次全站内容爬取,在执行速度和程序伪装上还有很大的提升空间,希望能够与大家一同交流成长。
