
Fetching Links from 人人电影网 (rrdynb.com)

This script is a web crawler that collects links to movies, TV series, and other resources from a target site and saves them to a JSON file. An explanation and the full code follow.

A few points worth noting:

  1. The code uses the requests library to send HTTP requests and the BeautifulSoup library to parse the HTML; both are staples of web-content processing.
  2. The USER_AGENTS list holds several User-Agent strings, so each request can impersonate a different browser.
  3. The get_target_links function collects the target links from the site, and the save_to_json function writes them to a JSON file.
  4. In the main program, the user picks a category and the corresponding links are fetched. The program also saves a checkpoint so it can resume where it left off on the next run.
  5. Randomized pauses are inserted between requests to mimic human browsing and avoid putting too much load on the server (see the short sketch after this list).
  6. Make sure your scraping is legal and compliant with the site's terms of use, and adjust the parameters as needed for your own purposes.
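Before the full listing, here is a minimal, self-contained sketch of the request pattern described in points 2 and 5 (a rotating User-Agent plus a randomized delay). The function name polite_get, the 10-second timeout, and the 2-5 second sleep range are illustrative choices, not values fixed by the original code:

import random
import time

import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]

def polite_get(url):
    # Pick a random User-Agent so consecutive requests look like different browsers.
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    response = requests.get(url, headers=headers, timeout=10)
    # Pause for a random interval to mimic human browsing.
    time.sleep(random.uniform(2, 5))
    return response

The full script follows: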
import requests
import random
import time
import json
import logging
import os
from bs4 import BeautifulSoup
from datetime import datetime
 
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3 Edge/16.16299",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    # Add more User-Agent strings as needed
]
 
# Configure the logging module to write UTF-8-encoded log files
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'crawler.log')
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', encoding='utf-8')
 
def get_target_links(category, start_page=1):
    # Each entry pairs a URL path segment with the site's list id for that category.
    category_list = [
        ("dongman", "13"),
        ("movie", "2"),
        ("dianshiju", "6"),
        ("zongyi", "10"),
    ]
 
    # Check that the selected category is within the valid range.
    if not (1 <= category <= len(category_list)):
        logging.warning("Invalid option.")
        return []
 
    # Look up the URL path segment and list id for the selected category.
    selected_category, selected_listnum = category_list[category - 1]
 
    base_url = f"https://www.rrdynb.com/{selected_category}/list_{selected_listnum}_"
    page_number = start_page
    target_links = []
 
    while True:
        url = f"{base_url}{page_number}.html"
 
        # Rotate through the User-Agent pool to vary the request fingerprint.
        user_agent = random.choice(USER_AGENTS)
        headers = {"User-Agent": user_agent}
 
        logging.info(f"Fetching page {page_number}...")
        response = requests.get(url, headers=headers, timeout=10)
 
        if response.status_code == 200:
            logging.info("Connected, scraping...")
            soup = BeautifulSoup(response.text, "html.parser")
            anchor = soup.find("a", class_="movie-thumbnails")
            if anchor is None:
                # No target link on this page; treat it as the end of the listing.
                logging.info(f"No target link found on page {page_number}; stopping.")
                break
            target_links.append(anchor["href"])
            page_number += 1
            time.sleep(random.uniform(2, 5))  # randomized pause between requests
        elif response.status_code == 404:
            logging.info(f"All pages fetched, {page_number - 1} in total.")
            break
        else:
            logging.warning(f"Failed to fetch page {page_number}. Retrying in 5 seconds...")
            time.sleep(5)  # retry after 5 seconds
            continue
 
    return target_links
 
def save_to_json(target_links, category):
    categories = {
        1: "anime",
        2: "movies",
        3: "tv_series",
        4: "old_movies",
    }
 
    if category not in categories:
        logging.warning("Invalid option.")
        return
 
    # Name the output file after the category and today's date, e.g. movies_20240101.json.
    today = datetime.now().strftime("%Y%m%d")
    file_name = f"{categories[category]}_{today}.json"
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(target_links, json_file, ensure_ascii=False, indent=4)
 
if __name__ == "__main__":
    print("1. Anime")
    print("2. Movies")
    print("3. TV series")
    print("4. Old movies")
 
    selected_category = int(input("Select an option: "))
 
    logging.info(f"User selected option {selected_category}")
 
    # Resume from the page recorded in the checkpoint file, if one exists.
    checkpoint_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'checkpoint.txt')
    try:
        with open(checkpoint_file_path, 'r') as checkpoint_file:
            # The checkpoint line has the form "<last_page>_<date>"; keep only the page number.
            start_page = int(checkpoint_file.readline().split("_")[0])
    except (FileNotFoundError, ValueError):
        start_page = 1
 
    target_links = get_target_links(selected_category, start_page)
    if target_links:
        today = datetime.now().strftime("%Y%m%d")
        with open(checkpoint_file_path, 'w') as checkpoint_file:
            checkpoint_file.write(f"{len(target_links) + start_page - 1}_{today}")
            logging.info("Checkpoint saved.")
 
        logging.info("Target links:")
        for link in target_links:
            logging.info(link)
 
        # Prefix each relative link with https://www.rrdynb.com and save the list once.
        full_links = [f"https://www.rrdynb.com{link}" for link in target_links]
        save_to_json(full_links, selected_category)
 
        logging.info(f"{len(target_links)} pages fetched and saved to the JSON file.")
    else:
        logging.warning("No target links found.")