python:简单爬取自己的一篇博客文章

2023-06-25

1、爬取文章地址:https://www.cnblogs.com/Mr-choa/p/12495157.html

爬取文章的标题、具体内容,保存到文章名.txt

代码如下:

# 导入requests模块
import requests
import urllib.request
# 导入re模块
import re
# Address of the article to crawl
url = 'https://www.cnblogs.com/Mr-choa/p/12495157.html'

# Matches the <a ... id="cb_post_title_url" ...>TITLE</a> anchor.
# [^>]* keeps the match inside the opening tag and .*? stops at the first
# closing </a>, so other anchors on the same line are not swallowed.
# re.S lets the title span several lines.
TITLE_PATTERN = re.compile(
    r'<a\b[^>]*\bid="cb_post_title_url"[^>]*>(.*?)</a>', re.S)

# Characters that are not allowed in file names on common file systems.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_title(html_page):
    """Return the article title found in *html_page*.

    The title is the text of the anchor whose id is "cb_post_title_url".
    Nested tags are removed and runs of whitespace are collapsed to a
    single space.

    Raises:
        ValueError: if the page contains no such anchor.
    """
    title_match = TITLE_PATTERN.search(html_page)
    if title_match is None:
        raise ValueError('title anchor "cb_post_title_url" not found')
    # Drop any markup nested inside the anchor, keep only the text
    text = re.sub(r'<[^>]+>', '', title_match.group(1))
    return ' '.join(text.split())


def safe_filename(title):
    """Return "<title>.txt" with characters unsafe for file names replaced.

    Every unsafe character becomes "_"; an empty result falls back to
    "untitled" so that a usable file name is always produced.
    """
    cleaned = UNSAFE_FILENAME_CHARS.sub('_', title).strip(' .')
    return (cleaned or 'untitled') + '.txt'


def main():
    """Download the article, print it, and save title + text to a file."""
    # Local import: bs4 is only needed once the page has been downloaded
    from bs4 import BeautifulSoup

    # Fetch the page; fail fast on a hung connection or an HTTP error
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Content of the whole page
    html_page = response.text

    # Extract and print the title
    title = extract_title(html_page)
    print(title)

    # Parse the page with the built-in HTML parser
    soup = BeautifulSoup(html_page, 'html.parser')
    # Print the whole document, pretty-printed
    print(soup.prettify())
    # Print the first <a> tag of the document, pretty-printed
    if soup.a is not None:
        print(soup.a.prettify())

    # Locate the element that holds the article body
    div = soup.find(id="cnblogs_post_body")
    if div is None:
        raise ValueError('article body "cnblogs_post_body" not found')
    # Print the article text
    print(div.text)

    # Build the output file name from the title
    filename = safe_filename(title)
    with open(filename, 'w', encoding='utf-8') as f:
        # Title first, on its own line, then the article text
        f.write(title)
        f.write('\n')
        f.write(div.text)


if __name__ == '__main__':
    main()

结果:

python实现杨辉三角形
<!DOCTYPE html>
<html lang="zh-cn">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="origin" name="referrer"/>
<meta content="代码实现: # python实现杨辉三角形 def yanghui(): # 定义第一行列表为[1] line = [1] while True: # yield的作用:把一个函数变成生成器,同时返回" property="og:description"/>
<meta content="no-transform" http-equiv="Cache-Control"/>
<meta content="no-siteapp" http-equiv="Cache-Control"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>
python实现杨辉三角形 - Mr_choa - 博客园
</title>
<link href="/css/blog-common.min.css?v=BKtyzabbeYJEVOaELkxmRjHbp7LT-v37GzrU5S24bJk" rel="stylesheet"/>
<link href="/skins/codinglife/bundle-codinglife-mobile.min.css?v=XjHfryC9ctsatIZz1dHn7TsRIqmwz9c3aj_6SCfHSDM" id="mobile-style" media="only screen and (max-width: 767px)" rel="stylesheet" type="text/css"/>
<link href="https://www.cnblogs.com/Mr-choa/rss" rel="alternate" type="application/rss+xml"/>
<link href="https://www.cnblogs.com/Mr-choa/rsd.xml" rel="EditURI" type="application/rsd+xml"/>
<link href="https://www.cnblogs.com/Mr-choa/wlwmanifest.xml" rel="wlwmanifest" type="application/wlwmanifest+xml"/>
<script src="https://common.cnblogs.com/scripts/jquery-2.2.0.min.js">
</script>
<script src="/js/blog-common.min.js?v=PjR5-TCjhk5LFAD2CMY5CO-o7uneBnLpPbqokBXsWnE">
</script>
<script>
var currentBlogId = 586285;
var currentBlogApp = 'Mr-choa';
var cb_enable_mathjax = false;
var isLogined = false;
var skinName = 'CodingLife';
</script>
</head>
<body>
<a name="top">
</a>
<div id="page_begin_html">
<!-- 自定制样式文件 -->
<link href="https://files.cnblogs.com/files/jingmoxukong/cnblog.min.css" rel="stylesheet"/>
<!-- 自定制脚本 -->
<!-- fork github 控件 -->
<a aria-label="View source on Github" class="github-corner" href="https://github.com/xusongxu">
<svg aria-hidden="true" height="" style="fill:#FD6C6C; color:#fff; position: absolute; top: 0; border: 0; right: 0;" viewbox="0 0 250 250" width="">
<path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z">
</path>
<path class="octo-arm" d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;">
</path>
<path class="octo-body" d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor">
</path>
</svg>
</a>
<style>
.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}
</style>
</div>
<!--done-->
<div id="home">
<div id="header">
<div id="blogTitle">
<a href="https://www.cnblogs.com/Mr-choa/" id="lnkBlogLogo">
<img alt="返回主页" id="blogLogo" src="/skins/custom/images/logo.gif"/>
</a>
<!--done-->
<h1>
<a class="headermaintitle HeaderMainTitle" href="https://www.cnblogs.com/Mr-choa/" id="Header1_HeaderTitle">
Mr_choa
</a>
</h1>
<h2>
</h2>
</div>
<!--end: blogTitle 博客的标题和副标题 -->
<div id="navigator">
<ul id="navList">
<li>
<a class="menu" href="https://www.cnblogs.com/" id="blog_nav_sitehome">
博客园
</a>
</li>
<li>
<a class="menu" href="https://www.cnblogs.com/Mr-choa/" id="blog_nav_myhome">
首页
</a>
</li>
<li>
<a class="menu" href="https://i.cnblogs.com/EditPosts.aspx?opt=1" id="blog_nav_newpost">
新随笔
</a>
</li>
<li>
</li>
<li>
<a class="menu" href="https://www.cnblogs.com/Mr-choa/rss/" id="blog_nav_rss">
订阅
</a>
<!--<partial name="./Shared/_XmlLink.cshtml" model="Model" /></li>-->
</li>
<li>
<a class="menu" href="https://i.cnblogs.com/" id="blog_nav_admin">
管理
</a>
</li>
</ul>
<div class="blogStats">
<span id="stats_post_count">
随笔 -
17
</span>
<span id="stats_article_count">
文章 -
0
</span>
<span id="stats-comment_count">
评论 -
0
</span>
</div>
<!--end: blogStats -->
</div>
<!--end: navigator 博客导航栏 -->
</div>
<!--end: header 头部 -->
<div id="main">
<div id="mainContent">
<div class="forFlow">
<div id="post_detail">
<!--done-->
<div id="topics">
<div class="post">
<h1 class="postTitle">
<a class="postTitle2" href="https://www.cnblogs.com/Mr-choa/p/12495157.html" id="cb_post_title_url">
python实现杨辉三角形
</a>
</h1>
<div class="clear">
</div>
<div class="postBody">
<div class="blogpost-body" id="cnblogs_post_body">
<pre><br/><strong><span style="font-size: 14px;">代码实现:</span></strong></pre>
<div class="cnblogs_code">
<pre><span style="font-size: 12px;"><span style="color: #008000;">#</span><span style="color: #008000;"> python实现杨辉三角形</span>
<span style="color: #0000ff;">def</span><span style="color: #000000;"> yanghui():
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 定义第一行列表为[1]</span>
line = [1<span style="color: #000000;">]
</span><span style="color: #0000ff;">while</span><span style="color: #000000;"> True:
</span><span style="color: #008000;">#</span><span style="color: #008000;"> yield的作用:把一个函数变成生成器,同时返回一个list,下次从yield的下条语句执行</span>
<span style="color: #0000ff;">yield</span><span style="color: #000000;"> line
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 设上一个为[1],通过式子可得[1,1],继而[1,2,1]......</span>
line = [1] + [line[i] + line[i + 1] <span style="color: #0000ff;">for</span> i <span style="color: #0000ff;">in</span> range(len(line) - 1)] + [1<span style="color: #000000;">] </span><span style="color: #008000;">#</span><span style="color: #008000;"> 输入杨辉三角形的行数</span>
n = int(input(<span style="color: #800000;">"</span><span style="color: #800000;">请输入行数:</span><span style="color: #800000;">"</span><span style="color: #000000;">))
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 定义一个结束的变量</span>
flag =<span style="color: #000000;"> 0
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 生成器可迭代,做个遍历</span>
<span style="color: #0000ff;">for</span> i <span style="color: #0000ff;">in</span><span style="color: #000000;"> yanghui():
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 打印每行的列表的元素,用空格连接</span>
<span style="color: #0000ff;">print</span>(<span style="color: #800000;">"</span> <span style="color: #800000;">"</span>.join(str(j) <span style="color: #0000ff;">for</span> j <span style="color: #0000ff;">in</span><span style="color: #000000;"> i))
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 打印完一行,flag+1</span>
flag += 1
<span style="color: #008000;">#</span><span style="color: #008000;"> 如果变量flag等于输入的行数,跳出for循环</span>
<span style="color: #0000ff;">if</span> flag ==<span style="color: #000000;"> n:
</span><span style="color: #008000;">#</span><span style="color: #008000;"> 跳出循环</span>
<span style="color: #0000ff;">break</span></span></pre>
</div>
<pre><br/><strong><span style="font-size: 14px;">效果:</span></strong></pre>
<p>
<span style="font-size: 14px;">
请输入行数:6
</span>
<br/>
<span style="font-size: 14px;">
1
</span>
<br/>
<span style="font-size: 14px;">
1 1
</span>
<br/>
<span style="font-size: 14px;">
1 2 1
</span>
<br/>
<span style="font-size: 14px;">
1 3 3 1
</span>
<br/>
<span style="font-size: 14px;">
1 4 6 4 1
</span>
<br/>
<span style="font-size: 14px;">
1 5 10 10 5 1
</span>
</p>
<p>
<span style="font-size: 14px;">
Process finished with exit code 0
</span>
</p>
<pre></pre>
</div>
<div id="MySignature">
</div>
<div class="clear">
</div>
<div id="blog_post_info_block">
<div id="blog_post_info">
</div>
<div class="clear">
</div>
<div id="post_next_prev">
</div>
</div>
</div>
<div class="postDesc">
posted @
<span id="post-date">
2020-03-14 23:23
</span>
<a href="https://www.cnblogs.com/Mr-choa/">
Mr_choa
</a>
阅读(
<span id="post_view_count">
...
</span>
)
评论(
<span id="post_comment_count">
...
</span>
)
<a href="https://i.cnblogs.com/EditPosts.aspx?postid=12495157" rel="nofollow">
编辑
</a>
<a href="javascript:void(0)" onclick="AddToWz(12495157);return false;">
收藏
</a>
</div>
</div>
</div>
<!--end: topics 文章、评论容器-->
</div>
<script src="https://common.cnblogs.com/highlight/9.12.0/highlight.min.js">
</script>
<script>
markdown_highlight();
</script>
<script>
var allowComments = true, cb_blogId = 586285, cb_blogApp = 'Mr-choa', cb_blogUserGuid = '942ed5be-58c0-4ddb-c560-08d7c7fab50e';
var cb_entryId = 12495157, cb_entryCreatedDate = '2020-03-14 23:23', cb_postType = 1;
loadViewCount(cb_entryId);
</script>
<a name="!comments">
</a>
<div id="blog-comments-placeholder">
</div>
<script>
var commentManager = new blogCommentManager();
commentManager.renderComments(0);
</script>
<div class="commentform" id="comment_form">
<a name="commentform">
</a>
<div id="divCommentShow">
</div>
<div id="comment_nav">
<span id="span_refresh_tips">
</span>
<a clientidmode="Static" href="javascript:void(0);" id="lnk_RefreshComments" onclick="return RefreshCommentList();" runat="server">
刷新评论
</a>
<a href="#" onclick="return RefreshPage();">
刷新页面
</a>
<a href="#top">
返回顶部
</a>
</div>
<div id="comment_form_container">
</div>
<div class="ad_text_commentbox" id="ad_text_under_commentbox">
</div>
<div id="ad_t2">
</div>
<div id="opt_under_post">
</div>
<script async="async" src="https://www.googletagservices.com/tag/js/gpt.js">
</script>
<script>
var googletag = googletag || {};
googletag.cmd = googletag.cmd || [];
</script>
<script>
googletag.cmd.push(function () {
googletag.defineSlot("/1090369/C1", [300, 250], "div-gpt-ad-1546353474406-0").addService(googletag.pubads());
googletag.defineSlot("/1090369/C2", [468, 60], "div-gpt-ad-1539008685004-0").addService(googletag.pubads());
googletag.pubads().enableSingleRequest();
googletag.enableServices();
});
</script>
<div class="c_ad_block" id="cnblogs_c1">
<div id="div-gpt-ad-1546353474406-0" style="height:250px; width:300px;">
</div>
</div>
<div id="under_post_news">
</div>
<div class="c_ad_block" id="cnblogs_c2">
<div id="div-gpt-ad-1539008685004-0" style="height:60px; width:468px;">
<script>
if (new Date() >= new Date(2018, 9, 13)) {
googletag.cmd.push(function () { googletag.display("div-gpt-ad-1539008685004-0"); });
}
</script>
</div>
</div>
<div id="under_post_kb">
</div>
<div class="c_ad_block" id="HistoryToday">
</div>
<script type="text/javascript">
fixPostBody();
deliverBigBanner();
setTimeout(function() { incrementViewCount(cb_entryId); }, 50); deliverAdT2();
deliverAdC1();
deliverAdC2();
loadNewsAndKb();
loadBlogSignature();
LoadPostCategoriesTags(cb_blogId, cb_entryId); LoadPostInfoBlock(cb_blogId, cb_entryId, cb_blogApp, cb_blogUserGuid);
GetPrevNextPost(cb_entryId, cb_blogId, cb_entryCreatedDate, cb_postType);
loadOptUnderPost();
GetHistoryToday(cb_blogId, cb_blogApp, cb_entryCreatedDate);
</script>
</div>
</div>
<!--end: forFlow -->
</div>
<!--end: mainContent 主体内容容器-->
<div id="sideBar">
<div id="sideBarMain">
<div class="newsItem" id="sidebar_news">
<script>
loadBlogNews();
</script>
</div>
<div id="blog-calendar" style="display:none">
</div>
<script>
loadBlogDefaultCalendar();
</script>
<div id="leftcontentcontainer">
<div id="blog-sidecolumn">
</div>
<script>
loadBlogSideColumn();
</script>
</div>
</div>
<!--end: sideBarMain -->
</div>
<!--end: sideBar 侧边栏容器 -->
<div class="clear">
</div>
</div>
<!--end: main -->
<div class="clear">
</div>
<div id="footer">
<!--done-->
Copyright © 2020 Mr_choa
<br/>
<span id="poweredby">
Powered by .NET Core on Kubernetes
</span>
</div>
<!--end: footer -->
</div>
<!--end: home 自定义的最大容器 -->
</body>
</html>
<a name="top">
</a> 代码实现: # python实现杨辉三角形
def yanghui():
# 定义第一行列表为[1]
line = [1]
while True:
# yield的作用:把一个函数变成生成器,同时返回一个list,下次从yield的下条语句执行
yield line
# 设上一个为[1],通过式子可得[1,1],继而[1,2,1]......
line = [1] + [line[i] + line[i + 1] for i in range(len(line) - 1)] + [1] # 输入杨辉三角形的行数
n = int(input("请输入行数:"))
# 定义一个结束的变量
flag = 0
# 生成器可迭代,做个遍历
for i in yanghui():
# 打印每行的列表的元素,用空格连接
print(" ".join(str(j) for j in i))
# 打印完一行,flag+1
flag += 1
# 如果变量flag等于输入的行数,跳出for循环
if flag == n:
# 跳出循环
break 效果:
请输入行数:611 11 2 11 3 3 11 4 6 4 11 5 10 10 5 1 Process finished with exit code 0

打开新建立的.txt文件:

使用正则表达式re模块将博客文章的标题提取出来

这里获取文章内容用的是bs4库的BeautifulSoup,可以让HTML页面更加友好地显示。我们已经得到了博客文章内容相关的HTML。现在还剩最后一步,就是把HTML标签去掉,保留文本内容。这一步,我们就可以利用HTMLParser来实现了。

参考博客:https://www.cnblogs.com/xingzhui/p/7881262.html

python:简单爬取自己的一篇博客文章的相关教程结束。

《python:简单爬取自己的一篇博客文章.doc》

下载本文的Word格式文档,以方便收藏与打印。