def Parser(soup):
    """Extract movie titles and absolute detail-page URLs from a listing page.

    Args:
        soup: BeautifulSoup of a listing page; movie links are ``<a class="ulink">``.

    Returns:
        (movies, full_urls): titles matching 《...》 and the detail-page URLs
        resolved against the module-level ``root_url``.

    NOTE(review): ``movies`` and ``full_urls`` can end up with different
    lengths when an anchor's text does not match the 《...》 pattern — the
    downstream ``zip`` in Save silently drops the extras; confirm intended.
    """
    raw = soup.find_all("a", class_='ulink')
    # Titles are wrapped in 《》 and assumed to be CJK-only (\u4e00-\u9fa5).
    ptn = re.compile(r"《[\u4e00-\u9fa5]*》")
    movies = []
    urls = []
    for each in raw:
        # each.string is None when the <a> contains nested tags; guard so
        # findall() is not handed None (which raises TypeError).
        text = each.string
        if text:
            movies.extend(ptn.findall(text))
        urls.append(each["href"])
    # Resolve relative hrefs against the site root (root_url is module-level).
    full_urls = [parse.urljoin(root_url, url) for url in urls]
    return movies, full_urls
def SubHtml(urls):
    """Visit each detail page and collect the first magnet link found on it.

    Args:
        urls: iterable of absolute detail-page URLs.

    Returns:
        List of magnet-link strings. Pages that fail to load (GetHtml returns
        None) or contain no magnet anchor are skipped, so the result may be
        shorter than ``urls``.
    """
    # Hoist the pattern out of the loop; matches hrefs starting with "magnet",
    # optionally ending in ".mp4".
    magnet_ptn = re.compile(r'magnet(.*?\.mp4)?')
    magnets = []
    for url in urls:
        soup = GetHtml(url)  # returns None on fetch failure
        if soup is None:
            continue
        link = soup.find("a", href=magnet_ptn)
        if link is not None:
            magnets.append(link.string)
            print(link.string)
    return magnets
import re from sre_constants import AT from bs4 import BeautifulSoup from selenium import webdriver import requests from urllib import parse import time from requests.adapters import HTTPAdapter from fake_useragent import UserAgent
def GetHtml(url):
    """Fetch ``url`` and return a parsed BeautifulSoup, or None on failure.

    NOTE(review): the visible source was truncated after the headers dict
    (and contained an invalid ``//`` comment); the request/parse/return logic
    below is a minimal reconstruction consistent with the callers, which
    expect a soup object or None — confirm against the original file.
    """
    ua = UserAgent()
    headers = {
        'user-agent': str(ua.random),  # random UA to avoid server rejection
        'Connection': 'close',
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        # Callers (SubHtml) treat None as "skip this page".
        return None
    return BeautifulSoup(resp.text, "html.parser")
def Parser(soup):
    """Extract movie titles and absolute detail-page URLs from a listing page.

    NOTE(review): this is a duplicate of an earlier ``Parser`` definition in
    the same file (this later one wins at import time) — consider removing one.

    Args:
        soup: BeautifulSoup of a listing page; movie links are ``<a class="ulink">``.

    Returns:
        (movies, full_urls): titles matching 《...》 and the detail-page URLs
        resolved against the module-level ``root_url``.
    """
    raw = soup.find_all("a", class_='ulink')
    # Titles are wrapped in 《》 and assumed to be CJK-only (\u4e00-\u9fa5).
    ptn = re.compile(r"《[\u4e00-\u9fa5]*》")
    movies = []
    urls = []
    for each in raw:
        # Guard: each.string is None for anchors with nested tags, and
        # findall(None) would raise TypeError.
        text = each.string
        if text:
            movies.extend(ptn.findall(text))
        urls.append(each["href"])
    # Resolve relative hrefs against the site root (root_url is module-level).
    full_urls = [parse.urljoin(root_url, url) for url in urls]
    return movies, full_urls
def Save(movies, urls, magnets, path=r"D:\AAA桌面映射文档\抢课app\爬虫\text.txt"):
    """Append (title, page URL, magnet link) triples to a text file.

    Args:
        movies: list of movie-title strings.
        urls: list of detail-page URL strings.
        magnets: list of magnet-link strings.
        path: output file; defaults to the original hard-coded location for
            backward compatibility, but can now be overridden.

    Note:
        The three lists are consumed in lockstep via zip(), so items beyond
        the shortest list are silently dropped.
    """
    with open(path, "a+", encoding='utf-8') as fp:
        for m, u, magnet in zip(movies, urls, magnets):
            fp.write(m + '\n')
            fp.write(u + '\n')
            fp.write(magnet + '\n')
def SubHtml(urls):
    """Visit each detail page and collect the first magnet link found on it.

    NOTE(review): this is a duplicate of an earlier ``SubHtml`` definition in
    the same file (this later one wins at import time) — consider removing one.

    Args:
        urls: iterable of absolute detail-page URLs.

    Returns:
        List of magnet-link strings; failed fetches (GetHtml returns None)
        and pages without a magnet anchor are skipped.
    """
    # Compile once instead of passing a raw pattern through every find() call.
    magnet_ptn = re.compile(r'magnet(.*?\.mp4)?')
    magnets = []
    for url in urls:
        soup = GetHtml(url)  # returns None on fetch failure
        if soup is None:
            continue
        link = soup.find("a", href=magnet_ptn)
        if link is not None:
            magnets.append(link.string)
            print(link.string)
    return magnets