使用Selenium抓取华尔街见闻和新浪财经数据


# 华尔街见闻与新浪财经数据采集

import requests
import pymongo
import time

from selenium import webdriver
from bs4 import BeautifulSoup
# from fake_useragent import UserAgent
# ua_list = UserAgent()
# Fixed Firefox-on-Linux User-Agent string, kept in place of the fake_useragent
# lookup that is commented out above.
# NOTE(review): none of the functions visible in this file read ua_list --
# confirm whether it is still needed.
ua_list= 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'


def get_hej_news():
    """Scrape the wallstreetcn.com global live feed into MongoDB.

    Opens https://wallstreetcn.com/live/global in Chrome, scrolls down twice
    (pausing 5 seconds after each scroll, presumably so more entries load),
    then parses every ``div.live-item`` found inside ``div.livenews``.

    Each entry is stored in the ``hej_news`` collection of the ``news``
    database on localhost:27017 as ``{'new_time': str, 'news': str}``.
    Documents are keyed by ``new_time`` only: any existing documents with the
    same timestamp are deleted before the new one is inserted, so two entries
    sharing a timestamp overwrite each other.

    Raises:
        RuntimeError: if the page contains no ``div.livenews`` container.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        hej_news = client['news']['hej_news']

        driver = webdriver.Chrome(r"/usr/local/share/chromedriver")
        try:
            driver.get('https://wallstreetcn.com/live/global')
            # window.scrollBy(0, step) scrolls the page down by `step` pixels.
            for step in (3000, 5000):
                driver.execute_script('window.scrollBy(0,%d)' % step)
                time.sleep(5)
            page = driver.page_source
        finally:
            # quit() closes every window and ends the driver process, even
            # when loading or scrolling failed.
            driver.quit()

        feed = BeautifulSoup(page, 'html.parser').find('div', class_='livenews')
        if feed is None:
            raise RuntimeError(
                "wallstreetcn live page has no 'livenews' container; "
                "the page layout may have changed"
            )

        for item in feed.find_all('div', class_='live-item'):
            time_tag = item.find('span', attrs={'class': 'live-item__time__text'})
            body_tag = item.find('div', attrs={'class': 'content-html'})
            if time_tag is None or body_tag is None:
                # Skip entries that lack a timestamp or a body instead of
                # aborting the whole run.
                continue

            # Plain string: the original's trailing comma turned this into a
            # 1-tuple, which was then stored and queried as an array.
            new_time = time_tag.get_text()
            # str.replace is literal, so '\n|//' never matched anything;
            # strip newlines and '//' separately (the apparent intent).
            text = body_tag.get_text().strip().replace('\n', '').replace('//', '')

            # delete_many is a no-op when nothing matches, so no prior count
            # is needed (Collection.count/remove no longer exist in PyMongo 4).
            hej_news.delete_many({'new_time': new_time})
            hej_news.insert_one({'new_time': new_time, 'news': text})
    finally:
        client.close()

    print('存储华尔街见闻宏观新闻成功')


def get_xlcj_news():
    """Scrape pages 1-6 of the Sina Finance global live-news feed into MongoDB.

    For each page of
    ``http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=N`` the
    page is opened in Chrome, scrolled down twice (pausing 5 seconds after
    each scroll), and every ``div.bd_i_og`` inside ``div.bd_list`` is parsed.

    Each entry is printed and stored in the ``xlcj_news`` collection of the
    ``news`` database on localhost:27017 as
    ``{'news_time': str, 'news_type': str, 'news': str}``. Documents are keyed
    by ``news_time`` only: existing documents with the same timestamp are
    deleted before the new one is inserted.

    Raises:
        RuntimeError: if a page contains no ``div.bd_list`` container.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        xlcj_news = client['news']['xlcj_news']

        # One browser session serves all pages; the original started and tore
        # down a fresh Chrome process for every page.
        driver = webdriver.Chrome(r"/usr/local/share/chromedriver")
        try:
            for page_no in range(1, 7):
                url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(page_no)
                driver.get(url)
                # window.scrollBy(0, step) scrolls the page down by `step` pixels.
                for step in (3000, 5000):
                    driver.execute_script('window.scrollBy(0,%d)' % step)
                    time.sleep(5)

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                listing = soup.find('div', class_='bd_list')
                if listing is None:
                    raise RuntimeError(
                        "Sina live page %d has no 'bd_list' container; "
                        "the page layout may have changed" % page_no
                    )

                for item in listing.find_all('div', class_='bd_i_og'):
                    time_tag = item.find('p', attrs={'class': 'bd_i_time_c'})
                    tags_tag = item.find('p', attrs={'class': 'bd_i_tags'})
                    text_tag = item.find('p', attrs={'class': 'bd_i_txt_c'})
                    if time_tag is None or text_tag is None:
                        # Skip entries without a timestamp or body instead of
                        # aborting the whole run.
                        continue

                    news_time = time_tag.get_text().strip()
                    # An entry without a tag line is stored with an empty type.
                    news_type = (
                        tags_tag.get_text().strip().replace("\n", "")
                        if tags_tag is not None else ''
                    )
                    text = text_tag.get_text()
                    print(news_time,news_type,text)

                    # delete_many is a no-op when nothing matches, so no prior
                    # count is needed (Collection.count/remove no longer exist
                    # in PyMongo 4).
                    xlcj_news.delete_many({'news_time': news_time})
                    xlcj_news.insert_one({
                        'news_time': news_time,
                        'news_type': news_type,
                        'news': text,
                    })
        finally:
            # quit() closes every window and ends the driver process, even
            # when a page failed to load or parse.
            driver.quit()
    finally:
        client.close()

    print('新浪财经突发live板块新闻存储成功')


def main():
    """Run both scrapers in order: wallstreetcn first, then Sina Finance."""
    # Disabled feature: prompt for a news timestamp (format
    # "2017-11-2 00:00:00") and convert it to an epoch-seconds string via
    # time.mktime(time.strptime(value, '%Y-%m-%d %H:%M:%S')).
    for scrape in (get_hej_news, get_xlcj_news):
        scrape()


# Run the scrapers only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

复制代码

本文链接：https://blog.csdn.net/weixin_34025151/article/details/88007266

智能推荐

python数据抓取和提取- request/selenium+bs4-lxml解析器的使用

Beautiful Soup支持Python标准库中的HTML解析器，还支持一些第三方的解析器；如果我们不安装第三方解析器，则Python会使用默认的解析器。lxml解析器更加强大，速度更快，推荐安装。在抓取网页之前，还需要判断待爬取的网页是动态加载还是静态加载，如果是动态加载还需进一步处理。浏览器（Chrome、Firefox）调试器判断法：用浏览器（以Chrome为例...

selenium动态抓取数据

动态网页数据抓取 Ajax（Asynchronous JavaScript And XML）即异步JavaScript和XML。通过在后台与服务器进行少量数据交换，Ajax可以使网页实现异步更新，这意味着可以在不重新加载整个网页的情况下，对网页的某部分进行更新。传统的网页（不使用Ajax）如果需要更新内容，必须重新加载整个页面。过去网页在传输数据格式方面使用的是XML语法，因此叫做Ajax。现在数据交...

解决Spyder无法抓取Yahoo！finance财经数据

问题描述：用Spyder在Yahoo！finance上抓取财经数据时，处于无反应状态，无法抓取所需的财经数据。主要是因为Yahoo！finance停用了它的历史数据API，所以为了能继续抓取数据，需要安装yfinance（yfinance的官网），这个就是专门为了解决无法从Yahoo！finance抓取数据而开发出来的。解决方法： 1、使用pip安装yfinance pip install yf...

selenium+firefox模拟下滑抓取新浪新闻

to Out[2]: ['台湾花莲地震已致2名陆客受伤其中1人伤势较重\n2月7日 11:26\n评论(2)|分享', '法制日报:旅游业长期被诟病的地区要看看是否涉黑\n2月7日 11:12\n评论(1)|分享', 'Close\n小伙伴们都在看什么新闻？登录微博你就知道！登录', '大陆为花莲跨海送暖的信号连岛内绿媒都接收到了\n2月7日 ...

网络爬虫：使用Selenium绕过登录抓取知乎数据

今天研究了下网络爬虫，有不少这方面的文章，开始找到的是用HttpRequest进行抓取，但是这种抓取对某些网站显然是不行的。比如知乎，要抓取信息必须先登录。又搜索这方面的内容，网上信息繁杂且混乱，而且关于C#方面的内容十分的少。在研究了很久，尝试了更久之后，终于初步实现了这一功能，代码位置：https://codechina.csdn.net/wjwlsyd/netcrawler/-/commi...

代码先锋网代码片段及技术文章聚合