一些爬虫 – 打开

# -*- coding: utf-8 -*-
'''
Created on Fri Apr 14 22:04:07 2017

@author: Administrator
'''
import urllib
from bs4 import BeautifulSoup

# A bare "import urllib" does not load the request submodule, so import it
# explicitly before use.
import urllib.request

# Fetch the holiday index page and parse it; the response is closed as soon as
# BeautifulSoup has consumed it.
with urllib.request.urlopen("http://shijian.cc/jieri/") as res:
    soup = BeautifulSoup(res, "lxml")

# First element whose class attribute is exactly "col-md-6 col-sm-12".
book_div = soup.find(attrs={"class": "col-md-6 col-sm-12"})
# find() returns None when nothing matches (e.g. after a page redesign);
# fall back to an empty list instead of failing with AttributeError.
book_a = (
    book_div.find_all(attrs={"class": "primary-link"})
    if book_div is not None
    else []
)
for book in book_a:
    print(book.string)

上面的代码输出了这个网页上列出的、提供节日信息的各个国家……好像没什么用，不过好歹是成功了。

import requests

# Browser-like request header. The original author's note: most sites answer
# with an error when this header is missing, so always send it.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
all_url = 'https://astr.ac.cn'  # URL to fetch

# Fetch all_url with requests.get, sending the header defined above.
# The timeout keeps the script from blocking forever on an unresponsive server.
start_html = requests.get(all_url, headers=headers, timeout=30)

# .text is the decoded page source. .content holds the raw bytes and is the
# one to use when downloading images, video, audio and other binary data.
print(start_html.text)

# Save the page source. The encoding is explicit so that characters outside
# the platform's default code page do not raise UnicodeEncodeError, and the
# context manager closes the file even if the write fails.
with open(r't.txt', 'w', encoding='utf-8') as f:
    f.write(start_html.text)

保存网站的源代码

from bs4 import BeautifulSoup
import os,urllib
def taonvlang(url):
    """Download every image referenced by a ``data-original`` attribute.

    The page at *url* is fetched and parsed with BeautifulSoup. Each ``<img>``
    tag and its attribute dict are printed; tags that carry a
    ``data-original`` attribute have that image downloaded and written to
    ``D:/taonvlang/`` under a five-digit, zero-padded sequence number followed
    by the file extension taken from the image URL.

    Args:
        url: Address of the HTML page to scan.

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
        OSError: If the output directory or a file cannot be written.
    """
    # Function-scope imports: a bare "import urllib" does not load the
    # request/parse submodules.
    import urllib.parse
    import urllib.request

    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    soup = BeautifulSoup(data, "lxml")

    path = "D:/taonvlang/"
    os.makedirs(path, exist_ok=True)  # create the output directory if missing

    count = 1  # sequence number used to name the saved images
    for img_tag in soup.find_all("img"):
        print(img_tag)  # the whole <img> tag
        attrs = img_tag.attrs  # the tag's attributes as a dict
        print(attrs)
        if "data-original" not in attrs:
            continue

        # Resolve relative and protocol-relative addresses against the page URL.
        image = urllib.parse.urljoin(url, attrs["data-original"])
        # Take the extension from the URL path only, so query strings and
        # dots in the host name cannot leak into the file name.
        extension = os.path.splitext(urllib.parse.urlparse(image).path)[1]
        filepath = os.path.join(path, str(count).zfill(5) + extension)

        # Download first, then open the file: a failed request must not leave
        # an empty file behind.
        with urllib.request.urlopen(image) as image_response:
            image_data = image_response.read()
        with open(filepath, "wb") as f:
            f.write(image_data)
        count += 1

if __name__ == "__main__":
    # When run as a script, scan the site's front page for images.
    url = "http://www.mzitu.com"
    taonvlang(url)

获取图片

Smilie Vote is loading.

“一些爬虫”下有一个评论：

打开君说道：

2018年12月24日下午8:18

import requests
from bs4 import BeautifulSoup
# Fetch the blog front page and print the title, timestamp and first link of
# every <article> element on it.
res = requests.get("https://astr.ac.cn", timeout=30)
res.encoding = "utf-8"
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(res.text, "html.parser")
for article in soup.select("article"):
    heading = article.select_one("h2")
    stamp = article.select_one("time")
    link = article.select_one("a")
    # Skip articles that lack any of the three elements instead of failing
    # with IndexError on the first incomplete one.
    if heading is None or stamp is None or link is None:
        continue
    title = heading.text
    published = stamp.text
    href = link.get("href")
    print(title, published, href)

回复

“一些爬虫”下有一个评论：

发表评论 取消回复

发表评论取消回复