| selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题 |
| |
| selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器 |
| |
| from selenium import webdriver |
# Each assignment below is an alternative driver class; every line rebinds
# `browser`, so keep only the one line for the browser you actually want.
| browser=webdriver.Chrome() |
| browser=webdriver.Firefox() |
| browser=webdriver.PhantomJS() |
| browser=webdriver.Safari() |
| browser=webdriver.Edge() |
官网:http://selenium-python.readthedocs.io
selenium+chromedriver
| |
| pip3 install selenium |
| 下载chromedriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.38,并非2.9 |
| 国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/ |
| 最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads |
| |
| |
| C:\Users\Administrator>python3 |
| Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32 |
| Type "help", "copyright", "credits" or "license" for more information. |
| >>> from selenium import webdriver |
| >>> driver=webdriver.Chrome() |
| >>> driver.get('https://www.baidu.com') |
| >>> driver.page_source |
| |
| |
| selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver |
| 下载链接:https://github.com/mozilla/geckodriver/releases |
PhantomJS不再更新
selenium+phantomjs
| |
| pip3 install selenium |
| 下载phantomjs,解压后把phantomjs.exe所在的bin目录放到环境变量 |
| 下载链接:http://phantomjs.org/download.html |
| |
| |
| C:\Users\Administrator>phantomjs |
| phantomjs> console.log('egon gaga') |
| egon gaga |
| undefined |
| phantomjs> ^C |
| C:\Users\Administrator>python3 |
| Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32 |
| Type "help", "copyright", "credits" or "license" for more information. |
| >>> from selenium import webdriver |
| >>> driver=webdriver.PhantomJS() |
| >>> driver.get('https://www.baidu.com') |
| >>> driver.page_source |
| 在 PhantomJS 年久失修, 后继无人的节骨眼 |
| Chrome 出来救场, 再次成为了反爬虫 Team 的噩梦 |
| 自Google 发布 chrome 59 / 60 正式版 开始便支持Headless mode |
| 这意味着在无 GUI 环境下, PhantomJS 不再是唯一选择 |
selenium+谷歌浏览器headless模式
| |
| |
| |
| |
| from selenium import webdriver |
| from selenium.webdriver.chrome.options import Options |
# Build the Chrome command-line switches for a headless (no GUI) session.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--hide-scrollbars')
chrome_options.add_argument('blink-settings=imagesEnabled=false')
chrome_options.add_argument('--headless')
chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"

# `options=` is the supported keyword; `chrome_options=` is deprecated.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://www.baidu.com')

    # True when the rendered page source contains the text 'hao123'.
    print('hao123' in driver.page_source)
finally:
    # quit() ends the whole session even if the steps above raised;
    # close() would only close the current window.
    driver.quit()
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # Type the query into the search box and submit it with ENTER.
    # (On Python 2, prefix the non-ASCII literal with u.)
    input_tag = browser.find_element(By.ID, 'kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Block (up to 10 s) until the element with id 'content_left' exists.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    print(browser.page_source)
    print(browser.current_url)
    print(browser.get_cookies())
finally:
    # Always release the browser, even when a step above fails.
    browser.quit()
| |
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| import time |
| |
driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com')
    wait = WebDriverWait(driver, 10)

    # Locate by id.
    print(driver.find_element(By.ID, 'kw'))

    # Locate by partial link text: click the first link whose text contains '录'.
    login = driver.find_elements(By.PARTIAL_LINK_TEXT, '录')[0]
    login.click()

    # Locate by tag name: first <a> element in the document.
    print(driver.find_element(By.TAG_NAME, 'a'))

    # Locate by class name, waiting until the element is clickable.
    button = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'tang-pass-footerBarULogin'))
    )
    button.click()

    # Locate by name / id, waiting for each form control.
    input_user = wait.until(EC.presence_of_element_located((By.NAME, 'userName')))
    input_pwd = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
    commit = wait.until(EC.element_to_be_clickable((By.ID, 'TANGRAM__PSP_10__submit')))

    # Placeholder credentials from the article; substitute your own.
    input_user.send_keys('18611453110')
    input_pwd.send_keys('xxxxxx')
    commit.click()

    # Locate by CSS selector.
    driver.find_element(By.CSS_SELECTOR, '#kw')

    time.sleep(5)
finally:
    # quit() tears down the session whether or not a lookup above failed.
    driver.quit()
| |
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| import time |
| |
driver = webdriver.PhantomJS()
try:
    # Implicit wait: each element lookup polls for up to 3 s before failing.
    driver.implicitly_wait(3)
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # '//' selects descendants at any depth; the CSS form is 'body a'.
    driver.find_element(By.XPATH, '//body//a')
    driver.find_element(By.CSS_SELECTOR, 'body a')

    # Positional predicate; XPath indexes start at 1.
    first_links = driver.find_elements(By.XPATH, '//body//a[1]')
    print(first_links[0].text)

    # Three ways to reach the fifth link: by position, by exact attribute
    # value, and by attribute substring.
    res1 = driver.find_element(By.XPATH, '//a[5]')
    res2 = driver.find_element(By.XPATH, '//a[@href="image5.html"]')
    res3 = driver.find_element(By.XPATH, '//a[contains(@href,"image5")]')
    print('==>', res1.text)
    print('==>', res2.text)
    print('==>', res3.text)

    # Absolute path from the document root.
    res1 = driver.find_element(By.XPATH, '/html/body/div/a')
    print(res1.text)

    # Select an <a> by an attribute of its child <img>.
    res2 = driver.find_element(By.XPATH, '//a[img/@src="image3_thumb.jpg"]')
    print(res2.tag_name, res2.text)

    # Several predicates combined. find_elements returns [] when nothing
    # matches instead of raising.
    # NOTE(review): the sample markup shown in this article has no element
    # named 'continue', so a single-element lookup would raise here — confirm
    # against the live page.
    res3 = driver.find_elements(By.XPATH, "//input[@name='continue'][@type='button']")
    res4 = driver.find_elements(By.XPATH, "//*[@name='continue'][@type='button']")

    time.sleep(5)
finally:
    driver.quit()
详解
# Sample HTML document used as the input for the lxml examples.
| doc=''' |
| <html> |
| <head> |
| <base href='http://example.com/' /> |
| <title>Example website</title> |
| </head> |
| <body> |
| <div id='images'> |
| <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> |
| <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> |
| <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> |
| <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> |
| <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> |
| <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a> |
| </div> |
| </body> |
| </html> |
| ''' |
| from lxml import etree |
| |
# Parse the HTML string above with lxml's HTML parser.
| html=etree.HTML(doc) |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
browser = webdriver.Chrome()
try:
    browser.get('https://www.amazon.cn/')

    # Wait (up to 10 s) for the container before looking inside it.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'cc-lm-tcgShowImgContainer')))

    tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

    # Read an HTML attribute of the element.
    print(tag.get_attribute('src'))

    # Read the WebElement properties: id, location, tag name and size.
    print(tag.id)
    print(tag.location)
    print(tag.tag_name)
    print(tag.size)
finally:
    # Release the browser even when the wait or the lookup fails.
    browser.quit()
| #1、selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js),一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待 |
| |
| #2、等待的方式分两种: |
| 隐式等待:在browser.get('xxx')前就设置,针对所有元素有效 |
| 显式等待:在browser.get('xxx')之后设置,只针对某个元素有效 |
隐式等待
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
browser = webdriver.Chrome()
try:
    # Implicit wait: set once, before navigation; every later element lookup
    # polls for up to 10 s before giving up.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')

    input_tag = browser.find_element(By.ID, 'kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # No explicit wait is needed here; the implicit wait covers this lookup.
    contents = browser.find_element(By.ID, 'content_left')
    print(contents)
finally:
    browser.quit()
显式等待
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag = browser.find_element(By.ID, 'kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Explicit wait: block (up to 10 s) for one specific element only.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
    print(contents)
finally:
    # Release the browser even if the wait times out.
    browser.quit()
点击,清空
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
import time

# CSS selector of the search submit button, used for both searches below.
search_button_css = '#nav-search > form > div.nav-right > div > input'

browser = webdriver.Chrome()
try:
    browser.get('https://www.amazon.cn/')
    wait = WebDriverWait(browser, 10)

    # First search: type a query and click the submit button.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'twotabsearchtextbox')))
    input_tag.send_keys('iphone 8')
    browser.find_element(By.CSS_SELECTOR, search_button_css).click()

    time.sleep(3)

    # Second search: re-locate the box on the new page, clear it, and search
    # again.
    input_tag = browser.find_element(By.ID, 'twotabsearchtextbox')
    input_tag.clear()
    input_tag.send_keys('iphone7plus')
    browser.find_element(By.CSS_SELECTOR, search_button_css).click()

    # Keep the result visible briefly before the browser is shut down.
    time.sleep(3)
finally:
    # The original listing never closed the browser; always release it.
    browser.quit()
| |
| |
Action Chains
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| import time |
| |
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The draggable/droppable elements are inside the 'iframeResult' frame.
    driver.switch_to.frame('iframeResult')
    source = driver.find_element(By.ID, 'draggable')
    target = driver.find_element(By.ID, 'droppable')

    # Press and hold on the source element, then move right in 2-pixel steps
    # until the horizontal distance to the target has been covered.
    ActionChains(driver).click_and_hold(source).perform()
    distance = target.location['x'] - source.location['x']

    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    # Drop the element.
    ActionChains(driver).release().perform()

    time.sleep(10)
finally:
    driver.quit()
在交互动作比较难实现的时候可以自己写JS(万能方法)
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
# Create the driver before entering `try`: if start-up itself fails there is
# nothing to clean up, and `finally` must not reference an unbound name.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Run arbitrary JavaScript in the context of the current page.
    browser.execute_script('alert("hello world")')
finally:
    browser.quit()
补充:frame的切换
| |
| |
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
# Create the driver before `try` so that `finally` never sees an unbound name.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the frame whose id is 'iframeResult'; lookups now resolve
    # inside that frame.
    browser.switch_to.frame('iframeResult')
    tag1 = browser.find_element(By.ID, 'droppable')
    print(tag1)

    # Switch back to the parent document before looking up its elements.
    browser.switch_to.parent_frame()
    tag2 = browser.find_element(By.ID, 'textareaCode')
    print(tag2)
finally:
    browser.quit()
模拟浏览器的前进后退
| #模拟浏览器的前进后退 |
| import time |
| from selenium import webdriver |
| |
browser = webdriver.Chrome()
try:
    # Visit three pages in sequence to build up a history.
    # NOTE(review): the original URLs were truncated after the scheme; these
    # three are stand-ins — any distinct pages work.
    browser.get('https://www.baidu.com')
    browser.get('https://www.taobao.com')
    browser.get('http://www.sina.com.cn/')

    browser.back()      # go one step back in the history
    time.sleep(10)
    browser.forward()   # and forward again
finally:
    browser.quit()
cookies
| |
| from selenium import webdriver |
| |
browser = webdriver.Chrome()
try:
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())

    # add_cookie() takes ONE cookie per call, described by a dict with the
    # required keys 'name' and 'value' — not an arbitrary key/value mapping.
    browser.add_cookie({'name': 'k1', 'value': 'xxx'})
    browser.add_cookie({'name': 'k2', 'value': 'yyy'})
    print(browser.get_cookies())
finally:
    # The original listing never closed the browser; always release it.
    browser.quit()
| |
| |
选项卡管理
| #选项卡管理:切换选项卡,有js的方式window.open,有windows快捷键:ctrl+t等,最通用的就是js的方式 |
| import time |
| from selenium import webdriver |
| |
browser = webdriver.Chrome()
try:
    # NOTE(review): the original URLs were truncated after the scheme; the
    # ones below are stand-ins — any pages work.
    browser.get('https://www.baidu.com')

    # Open a second tab through JavaScript.
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of all open tabs

    # switch_to.window is the supported form (switch_to_window is deprecated).
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)

    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # quit() closes every tab; close() would close only the current one.
    browser.quit()
异常处理
| from selenium import webdriver |
| from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException |
| |
# Create the driver before `try` so that `finally` never sees an unbound name.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): this frame id differs from the 'iframeResult' used in the
    # other listings — presumably misspelled on purpose to trigger
    # NoSuchFrameException.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.quit()
自动登录163邮箱并发送邮件
| |
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| |
import time

browser = webdriver.Chrome()
try:
    browser.get('http://mail.163.com/')
    wait = WebDriverWait(browser, 5)

    # The login controls are looked up inside an iframe: switch into it first.
    frame = wait.until(EC.presence_of_element_located((By.ID, 'x-URS-iframe')))
    browser.switch_to.frame(frame)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-container')))

    # Fill in the login form. Placeholder credentials; substitute your own.
    inp_user = browser.find_element(By.NAME, 'email')
    inp_pwd = browser.find_element(By.NAME, 'password')
    button = browser.find_element(By.ID, 'dologin')
    inp_user.send_keys('18611453110')
    inp_pwd.send_keys('xxxx')
    button.click()

    # After login, click the second item of the top navigation bar.
    wait.until(EC.presence_of_element_located((By.ID, 'dvNavTop')))
    write_msg = browser.find_elements(By.CSS_SELECTOR, '#dvNavTop li')[1]
    write_msg.click()

    # Fill in recipient and subject.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tH0')))
    recv_man = browser.find_element(By.CLASS_NAME, 'nui-editableAddr-ipt')
    title = browser.find_element(By.CSS_SELECTOR, '.dG0 .nui-ipt-input')
    recv_man.send_keys('378533872@qq.com')
    title.send_keys('圣旨')
    print(title.tag_name)

    # The message body is typed into the <body> of the editor iframe.
    frame = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'APP-editor-iframe')))
    browser.switch_to.frame(frame)
    body = browser.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys('egon很帅,可以加工资了')

    # Back to the parent document, then click the first toolbar item.
    browser.switch_to.parent_frame()
    send_button = browser.find_element(By.CLASS_NAME, 'nui-toolbar-item')
    send_button.click()

    # Long pause kept from the original so the browser stays open.
    time.sleep(10000)
except Exception as e:
    print(e)
finally:
    # quit() ends the whole session; close() would only close the window.
    browser.quit()
爬取京东商城商品信息
| from selenium import webdriver |
| from selenium.webdriver import ActionChains |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.common.keys import Keys |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| import time |
| |
def get_goods(driver, max_pages=1000):
    """Print name, link, price and review count of every product listed.

    Processes the page currently shown by ``driver``, then clicks the link
    whose text contains "下一页" and repeats until no such link is found,
    the click fails, or ``max_pages`` pages have been processed.

    Args:
        driver: Selenium WebDriver already showing a search-result page.
        max_pages: Upper bound on the number of pages processed. The previous
            recursive version was bounded only by the interpreter's recursion
            limit; this makes the bound explicit.
    """
    # Local import keeps this function self-contained within the listing.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    for _ in range(max_pages):
        for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
            try:
                detail_url = good.find_element(By.TAG_NAME, 'a').get_attribute('href')
                p_name = good.find_element(By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
                price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
                p_commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text
            except NoSuchElementException as exc:
                # Best effort: one incomplete item must not end the crawl,
                # and must not vanish silently either.
                print('skipping item: %s' % exc)
                continue

            msg = '''
商品 : %s
链接 : %s
价钱 :%s
评论 :%s
''' % (p_name, detail_url, price, p_commit)

            print(msg, end='\n\n')

        # find_elements returns [] when there is no next-page link, so the end
        # of the results is detected without relying on an exception.
        next_links = driver.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
        if not next_links:
            return
        try:
            next_links[0].click()
        except WebDriverException as exc:
            print('cannot open next page: %s' % exc)
            return
        time.sleep(1)
| |
def spider(url, keyword):
    """Open ``url`` in Chrome, search for ``keyword`` and print the results.

    Args:
        url: Address of the page that contains the search box (id 'key').
        keyword: Text typed into the search box and submitted with ENTER.
    """
    driver = webdriver.Chrome()
    try:
        # Set the implicit wait before navigating so it covers every lookup.
        driver.implicitly_wait(3)
        driver.get(url)

        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # Release the browser even when navigation or a lookup fails.
        driver.quit()
| |
# Script entry point: run the spider only when executed directly.
if __name__ == "__main__":
    spider("https://www.jd.com/", keyword="iPhone8手机")
作业:
| 爬取亚马逊iphone手机的商品信息 |
| 爬取天猫python书籍的商品信息 |
| 爬取京东小米手机的商品信息 |