怎么复制自己的微博ID（并写入文件）

首页教程更新时间：2023-06-10 20:53:44

# 输入 # 用户id，例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” # # 输出 # 用户名：用户昵称，如"Dear-迪丽热巴" # 微博数：用户的全部微博数（转发微博+原创微博） # 关注数：用户关注的微博账号数量 # 粉丝数：用户的粉丝数 # 微博内容：以list的形式存储了用户所有微博内容 # 微博发布时间：以list的形式存储了用户所有微博的发布时间 # 微博对应的点赞数：以list的形式存储了用户所有微博对应的点赞数 # 微博对应的转发数：以list的形式存储了用户所有微博对应的转发数 # 微博对应的评论数：以list的形式存储了用户所有微博对应的评论数 # 结果文件：保存在当前目录的weibo文件夹里，名字为"user_id.txt"的形式 # 运行环境 # 开发语言：python2.7 # 系统： Windows/Linux # 使用说明 # 1.下载脚本 # # $ git clone https://github.com/dataabc/weibospider.git # 运行上述命令，将本项目下载到当前目录，如果下载成功当前目录会出现一个名为"weibospider"的文件夹； # 2.用文本编辑器打开weibospider文件夹下的"weibospider.py"文件； # 3.将"weibospider.py"文件中的“your cookie”替换成爬虫微博的cookie，后面会详细讲解如何获取cookie； # 4.将"weibospider.py"文件中的user_id替换成想要爬取的微博的user_id，后面会详细讲解如何获取user_id； # 5.按需求调用脚本。本脚本是一个Weibo类，用户可以按照自己的需求调用Weibo类。例如用户可以直接在"weibospider.py"文件中调用Weibo类，具体调用代码示例如下： # # user_id = 1669879400 # filter = 1 # wb = Weibo(user_id,filter) #调用Weibo类，创建微博实例wb # wb.start() #爬取微博信息 # user_id可以改成任意合法的用户id（爬虫的微博id除外）；filter默认值为0，表示爬取所有微博信息（转发微博+原创微博），为1表示只爬取用户的所有原创微博；wb是Weibo类的一个实例，也可以是其它名字，只要符合python的命名规范即可；通过执行wb.start() 完成了微博的爬取工作。在上述代码之后，我们可以得到很多信息： # wb.username：用户名； # wb.weibo_num：微博数； # wb.following：关注数； # wb.followers：粉丝数； # wb.weibo_content：存储用户的所有微博，为list形式，若filter=1， wb.weibo_content[0]为最新一条原创微博，filter=0为最新一条微博，wb.weibo_content[1]、wb.weibo_content[2]分别表示第二新和第三新的微博，以此类推。当然如果用户没有发过微博，wb.weibo_content则为[]； # wb.publish_time: 存储微博的发布时间，为list形式，如wb.publish_time[0]为最新一条微博的发布时间，与wb.weibo_content[0]对应，其它用法同wb.weibo_content； # wb.up_num：存储微博获得的点赞数，为list形式，如wb.up_num[0]为最新一条微博获得的点赞数，与wb.weibo_content[0]对应，其它用法同wb.weibo_content； # wb.retweet_num：存储微博获得的转发数，为list形式，如wb.retweet_num[0]为最新一条微博获得的转发数，与wb.weibo_content[0]对应，其它用法同wb.weibo_content； # wb.comment_num：存储微博获得的评论数，为list形式，如wb.comment_num[0]为最新一条微博获得的评论数，与wb.weibo_content[0]对应，其它用法同wb.weibo_content。 # 6.运行脚本。我的运行环境是IPython,通过 # # $ run filepath/weibospider.py # 即可运行脚本，大家可以根据自己的运行环境选择运行方式； Linux可以通过 # # $ python filepath/weibospider.py # 如何获取cookie # 
1.用Chrome打开https://passport.weibo.cn/signin/login； # 2.按F12键打开Chrome开发者工具； # 3.点开“Network”，将“Preserve log”选中，输入微博的用户名、密码，登录 # 4.点击Chrome开发者工具"Name"列表中的"m.weibo.cn",点击"Headers"，其中"Request Headers"下，"Cookie"后的值即为我们要找的cookie值，复制即可 # 如何获取user_id # 1.打开网址http://weibo.cn，搜索我们要找的人，如“郭碧婷”，进入她的主页； # 2.大部分情况下，在用户主页的地址栏里就包含了user_id，如“郭碧婷”的地址栏地址为"http://weibo.cn/u/1729370543?f=search_0"，其中的"1729370543"就是她的user_id。 # 注意事项 # 1.user_id不能为爬虫微博的user_id。因为要爬微博信息，必须先登录到某个微博账号，此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面，得到的网页格式不同，所以无法爬取自己的微博信息； # 2.cookie有期限限制，大约两天的有效期，超过有效期需重新更新cookie。

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl one Weibo user's profile and posts from weibo.cn and save them as text.

The script logs in with a browser cookie (see ``Weibo.cookie``), reads the
user's nickname, counters and posts, prints them, and writes a summary to
``weibo/<user_id>.txt`` next to this file.

The code is written so that it runs on Python 2.7 and also parses on
Python 3: every ``print`` call takes exactly one argument.
"""

import os
import re
import requests
import sys
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree

# Matches an integer or decimal number such as "12" or "12.5".
NUMBER_PATTERN = r"\d+\.?\d*"


def _print_error(error):
    """Print an error in the format of the legacy ``print "Error: ", e``."""
    # The Python 2 print statement inserted a separator space after the
    # literal "Error: ", hence the two spaces here.
    print("Error:  %s" % (error,))


def _console_safe(text):
    """Drop the characters of *text* that the console encoding cannot show.

    Falls back to UTF-8 when ``sys.stdout.encoding`` is ``None`` (for example
    when output is piped on Python 2), which would otherwise raise TypeError.
    """
    encoding = sys.stdout.encoding or "utf-8"
    return text.encode(encoding, "ignore").decode(encoding)


def _normalize_publish_time(publish_time, now=None):
    """Convert weibo.cn's relative time text to a ``YYYY-MM-DD HH:MM`` string.

    Args:
        publish_time: the text that precedes u"来自" in a post's time span.
        now: reference time; defaults to ``datetime.now()``.

    Returns:
        The normalised time string. Text in none of the relative forms is
        assumed to be an absolute timestamp and is cut to 16 characters.
    """
    if now is None:
        now = datetime.now()
    if u"刚刚" in publish_time:
        return now.strftime('%Y-%m-%d %H:%M')
    if u"分钟" in publish_time:
        minutes = int(publish_time[:publish_time.find(u"分钟")])
        return (now - timedelta(minutes=minutes)).strftime("%Y-%m-%d %H:%M")
    if u"今天" in publish_time:
        # Skip the 3-character prefix (presumably u"今天" plus a space).
        return now.strftime("%Y-%m-%d") + " " + publish_time[3:]
    if u"月" in publish_time:
        # Fixed-width slices; presumably the text looks like "MM月DD日 HH:MM"
        # -- TODO confirm against the live page.
        year = now.strftime("%Y")
        month = publish_time[0:2]
        day = publish_time[3:5]
        clock = publish_time[7:12]
        return year + "-" + month + "-" + day + " " + clock
    return publish_time[:16]


class Weibo:
    """Crawler that gathers one user's profile data and posts from weibo.cn."""

    # Replace "your cookie" with the cookie of the account used for crawling.
    cookie = {"Cookie": "your cookie"}

    def __init__(self, user_id, filter=0):
        """Create a crawler for *user_id*.

        Args:
            user_id: numeric user id, e.g. 1669879400.
            filter: 0 (default) crawls all posts, 1 crawls original posts only.
        """
        self.user_id = user_id
        self.filter = filter
        self.username = ''  # nickname
        self.weibo_num = 0  # total number of posts reported by the profile
        self.weibo_num2 = 0  # number of posts actually crawled
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibo_content = []  # post texts
        self.publish_time = []  # publish time of each post
        self.up_num = []  # like count of each post
        self.retweet_num = []  # repost count of each post
        self.comment_num = []  # comment count of each post

    def get_username(self):
        """Fetch the user's nickname into ``self.username`` and print it."""
        try:
            url = "https://weibo.cn/%d/info" % (self.user_id)
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)
            title = selector.xpath("//title/text()")[0]
            # The page title carries a 3-character suffix after the nickname.
            self.username = title[:-3]
            print(u"用户名: " + self.username)
        except Exception as e:
            _print_error(e)
            traceback.print_exc()

    def get_user_info(self):
        """Fetch and print the post, following and follower counts."""
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (
                self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)

            # Number of posts: first number in the "tc" span.
            str_wb = selector.xpath(
                "//div[@class='tip2']/span[@class='tc']/text()")[0]
            numbers = re.findall(NUMBER_PATTERN, str_wb, re.S | re.M)
            self.weibo_num = int(numbers[0])
            print(u"微博数: " + str(self.weibo_num))

            link_texts = selector.xpath("//div[@class='tip2']/a/text()")

            # Following count: first link of the "tip2" bar.
            numbers = re.findall(NUMBER_PATTERN, link_texts[0], re.M)
            self.following = int(numbers[0])
            print(u"关注数: " + str(self.following))

            # Follower count: second link of the "tip2" bar.
            numbers = re.findall(NUMBER_PATTERN, link_texts[1], re.M)
            self.followers = int(numbers[0])
            print(u"粉丝数: " + str(self.followers))
        except Exception as e:
            _print_error(e)
            traceback.print_exc()

    def get_long_weibo(self, weibo_link):
        """Return the full text of a long post, or None if fetching fails.

        Args:
            weibo_link: absolute URL of the post's comment page.
        """
        try:
            html = requests.get(weibo_link, cookies=self.cookie).content
            selector = etree.HTML(html)
            info = selector.xpath("//div[@class='c']")[1]
            full_text = info.xpath("div/span[@class='ctt']")[0].xpath(
                "string(.)")
            return _console_safe(full_text)
        except Exception as e:
            _print_error(e)
            traceback.print_exc()

    def _record_weibo(self, node):
        """Parse one post element and append its data to the result lists."""
        # Post text; the last character of the span text is dropped.
        weibo_content = _console_safe(
            node.xpath("div/span[@class='ctt']")[0].xpath("string(.)"))
        weibo_content = weibo_content[:-1]
        # The element id carries a 2-character prefix before the post id.
        weibo_id = node.xpath("@id")[0][2:]
        a_link = node.xpath("div/span[@class='ctt']/a/@href")
        # A trailing link to the post's own comment page means the text was
        # truncated, so the full text is fetched from that page.
        if a_link and a_link[-1] == "/comment/" + weibo_id:
            weibo_link = "https://weibo.cn" + a_link[-1]
            wb_content = self.get_long_weibo(weibo_link)
            if wb_content:
                weibo_content = wb_content
        self.weibo_content.append(weibo_content)
        print(u"微博内容：" + weibo_content)

        # Publish time: the part of the "ct" span before the client name.
        str_time = _console_safe(
            node.xpath("div/span[@class='ct']")[0].xpath("string(.)"))
        publish_time = _normalize_publish_time(str_time.split(u'来自')[0])
        self.publish_time.append(publish_time)
        print(u"微博发布时间：" + publish_time)

        # Counters: the numbers after the last u"赞" in the post's final div,
        # in the order likes, reposts, comments.
        str_footer = _console_safe(node.xpath("div")[-1].xpath("string(.)"))
        str_footer = str_footer[str_footer.rfind(u'赞'):]
        numbers = re.findall(NUMBER_PATTERN, str_footer, re.M)

        up_num = int(numbers[0])
        self.up_num.append(up_num)
        print(u"点赞数: " + str(up_num))

        retweet_num = int(numbers[1])
        self.retweet_num.append(retweet_num)
        print(u"转发数: " + str(retweet_num))

        comment_num = int(numbers[2])
        self.comment_num.append(comment_num)
        print(u"评论数: " + str(comment_num))

        self.weibo_num2 += 1

    def get_weibo_info(self):
        """Crawl every page of posts: text, time, likes, reposts, comments."""
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (
                self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)
            # The "mp" input holds the page count; it is absent when there is
            # only one page.
            page_inputs = selector.xpath("//input[@name='mp']")
            if page_inputs:
                page_num = int(page_inputs[0].attrib["value"])
            else:
                page_num = 1

            for page in range(1, page_num + 1):
                url2 = "https://weibo.cn/u/%d?filter=%d&page=%d" % (
                    self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=self.cookie).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                has_posts = info[0].xpath("div/span[@class='ctt']")
                if has_posts:
                    # The last two "c" divs are skipped (presumably page
                    # chrome rather than posts).
                    for node in info[:-2]:
                        self._record_weibo(node)

            if not self.filter:
                print(u"共" + str(self.weibo_num2) + u"条微博")
            else:
                print(u"共" + str(self.weibo_num) + u"条微博，其中" +
                      str(self.weibo_num2) + u"条为原创微博")
        except Exception as e:
            _print_error(e)
            traceback.print_exc()

    def write_txt(self):
        """Write the crawled data to ``weibo/<user_id>.txt`` beside this file."""
        try:
            if self.filter:
                result_header = u"\n\n原创微博内容：\n"
            else:
                result_header = u"\n\n微博内容：\n"
            parts = [
                u"用户信息\n用户昵称：" + self.username +
                u"\n用户id：" + str(self.user_id) +
                u"\n微博数：" + str(self.weibo_num) +
                u"\n关注数：" + str(self.following) +
                u"\n粉丝数：" + str(self.followers) +
                result_header
            ]
            for i in range(1, self.weibo_num2 + 1):
                parts.append(
                    str(i) + ":" + self.weibo_content[i - 1] + "\n" +
                    u"发布时间：" + self.publish_time[i - 1] + "\n" +
                    u"点赞数：" + str(self.up_num[i - 1]) +
                    u"	 转发数：" + str(self.retweet_num[i - 1]) +
                    u"	 评论数：" + str(self.comment_num[i - 1]) + "\n\n"
                )
            result = u"".join(parts)

            file_dir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), "weibo")
            if not os.path.isdir(file_dir):
                os.mkdir(file_dir)
            file_path = os.path.join(file_dir, "%d.txt" % self.user_id)
            # The file uses the console encoding, as before, with a UTF-8
            # fallback when the console encoding is unknown.
            with open(file_path, "wb") as f:
                f.write(result.encode(sys.stdout.encoding or "utf-8"))
            print(u"微博写入文件完毕，保存路径:" + file_path)
        except Exception as e:
            _print_error(e)
            traceback.print_exc()

    def start(self):
        """Run the whole crawl: nickname, counters, posts, then the file."""
        try:
            self.get_username()
            self.get_user_info()
            self.get_weibo_info()
            self.write_txt()
            print(u"信息抓取完毕")
            print("****************************************************************")
        except Exception as e:
            _print_error(e)


def main():
    """Example run: crawl one user and print a short summary."""
    try:
        # Any valid user id works except the id of the account whose cookie
        # is used for crawling.
        user_id = 5982879020
        # 0 crawls all posts (original and reposts), 1 original posts only.
        filter = 1
        wb = Weibo(user_id, filter)
        wb.start()
        print(u"用户名：" + wb.username)
        print(u"全部微博数：" + str(wb.weibo_num))
        print(u"关注数：" + str(wb.following))
        print(u"粉丝数：" + str(wb.followers))
        if wb.weibo_content:
            print(u"最新/置顶微博为：" + wb.weibo_content[0])
            print(u"最新/置顶微博发布时间：" + wb.publish_time[0])
            print(u"最新/置顶微博获得赞数：" + str(wb.up_num[0]))
            print(u"最新/置顶微博获得转发数：" + str(wb.retweet_num[0]))
            print(u"最新/置顶微博获得评论数：" + str(wb.comment_num[0]))
    except Exception as e:
        _print_error(e)
        traceback.print_exc()


if __name__ == "__main__":
    main()

点此查看全文

图文教程