{"id":5333,"date":"2022-04-02T19:34:13","date_gmt":"2022-04-02T11:34:13","guid":{"rendered":"https:\/\/egonlin.com\/?p=5333"},"modified":"2022-04-02T19:34:13","modified_gmt":"2022-04-02T11:34:13","slug":"07-07-%e7%88%ac%e8%99%abor%e6%95%b0%e6%8d%ae%e5%88%86%e6%9e%90%e6%93%8d%e4%bd%9c%e5%ae%9e%e4%be%8b","status":"publish","type":"post","link":"https:\/\/egonlin.com\/?p=5333","title":{"rendered":"07-07 \u722c\u866bor\u6570\u636e\u5206\u6790\u64cd\u4f5c\u5b9e\u4f8b"},"content":{"rendered":"<h1>\u722c\u866bor\u6570\u636e\u5206\u6790<\/h1>\n<p><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/egonlin.com\/wp-content\/uploads\/2022\/04\/\u722c\u866bor\u6570\u636e\u5206\u6790\u64cd\u4f5c\u5b9e\u4f8b-1.jpg'><img class=\"lazyload lazyload-style-2\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  data-original=\"https:\/\/egonlin.com\/wp-content\/uploads\/2022\/04\/\u722c\u866bor\u6570\u636e\u5206\u6790\u64cd\u4f5c\u5b9e\u4f8b-1.jpg\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" \/><\/div><\/p>\n<p>\u4f7f\u7528requests\u5e93\u722c\u53d6\u54d4\u54e9\u54d4\u54e9\u89c6\u9891\u5f39\u5e55\uff0c\u7ed3\u5408BeautifulSoup\u89e3\u6790\u722c\u53d6\u7684\u6570\u636e<\/p>\n<p>\u4f7f\u7528\u6570\u636e\u5206\u6790\u5e38\u7528\u5e93pandas\u3001numpy\u5bf9\u6570\u636e\u505a\u8fdb\u4e00\u6b65\u5904\u7406<\/p>\n<p>\u6700\u540e\u901a\u8fc7\u7ed3\u5df4\u5206\u8bcd\u4ee5\u53ca\u8bcd\u4e91\u5e93\u5c06\u5206\u6790\u7ed3\u679c\u5c55\u793a\u51fa\u6765<\/p>\n<h2>\u5bfc\u5165\u6a21\u5757<\/h2>\n<pre><code class=\"language-python\">import requests\nfrom bs4 import BeautifulSoup\nimport datetime\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport re\nimport jieba\nimport numpy as np\nfrom wordcloud import WordCloud, ImageColorGenerator<\/code><\/pre>\n<h2>\u722c\u53d6\u6570\u636e<\/h2>\n<pre><code class=\"language-python\">url = &quot;https:\/\/comment.bilibili.com\/92542241.xml&quot;\nr = requests.get(url)\nr.encoding = &#039;utf8&#039;\n\nsoup = BeautifulSoup(r.text,&#039;lxml&#039;)\nd = soup.find_all(&#039;d&#039;)\n\ndlst = []\nn = 0\nfor i in d:\n    n += 1\n    danmuku = {}\n    danmuku[&#039;\u5f39\u5e55&#039;] = i.text\n    danmuku[&#039;\u7f51\u5740&#039;] = url\n    danmuku[&#039;\u65f6\u95f4&#039;] = datetime.date.today()\n    dlst.append(danmuku)\n\ndf = pd.DataFrame(dlst)\n\nwith open(&#039;sign.txt&#039;,&#039;w&#039;,encoding=&#039;utf8&#039;) as f:\n    for text in df[&#039;\u5f39\u5e55&#039;].values:\n        pattern = re.compile(r&#039;[\u4e00-\u9fa5]+&#039;)\n        filter_data = re.findall(pattern,text)\n        f.write(&quot;&quot;.join(filter_data))<\/code><\/pre>\n<h2>\u6570\u636e\u5206\u6790<\/h2>\n<pre><code class=\"language-python\">with open(&#039;sign.txt&#039;, &#039;r&#039;, encoding=&#039;utf8&#039;) as f:\n    data = f.read()\n    segment = jieba.lcut(data)\n    words_df = pd.DataFrame({&quot;segment&quot;: segment})\n\nword_stat = words_df.groupby(by=[&#039;segment&#039;])[&#039;segment&#039;].agg({&#039;\u8ba1\u6570&#039;:np.size})\nwords_stat = word_stat.reset_index().sort_values(by=[&#039;\u8ba1\u6570&#039;],ascending=False)\n\ncolor_mask = imread(&#039;01.jpg&#039;)\n\nwordcloud = WordCloud(\n    # font_path=&quot;simhei.ttf&quot;,   # mac\u4e0a\u6ca1\u6709\u8be5\u5b57\u4f53\n    font_path=&quot;C:\\Windows\\Fonts\\simkai.ttf&quot;,\n    # \u8bbe\u7f6e\u5b57\u4f53\u53ef\u4ee5\u663e\u793a\u4e2d\u6587\n    background_color=&quot;white&quot;,  # \u80cc\u666f\u989c\u8272\n    max_words=3000,  # \u8bcd\u4e91\u663e\u793a\u7684\u6700\u5927\u8bcd\u6570\n    mask=color_mask,  # \u8bbe\u7f6e\u80cc\u666f\u56fe\u7247\n    max_font_size=200,  # \u5b57\u4f53\u6700\u5927\u503c\n    random_state=100,\n    width=1000, height=860, margin=2,\n    # \u8bbe\u7f6e\u56fe\u7247\u9ed8\u8ba4\u7684\u5927\u5c0f,\u4f46\u662f\u5982\u679c\u4f7f\u7528\u80cc\u666f\u56fe\u7247\u7684\u8bdd,                                                   # \u90a3\u4e48\u4fdd\u5b58\u7684\u56fe\u7247\u5927\u5c0f\u5c06\u4f1a\u6309\u7167\u5176\u5927\u5c0f\u4fdd\u5b58,margin\u4e3a\u8bcd\u8bed\u8fb9\u7f18\u8ddd\u79bb\n)\n\n# \u751f\u6210\u8bcd\u4e91, \u53ef\u4ee5\u7528generate\u8f93\u5165\u5168\u90e8\u6587\u672c,\u4e5f\u53ef\u4ee5\u6211\u4eec\u8ba1\u7b97\u597d\u8bcd\u9891\u540e\u4f7f\u7528generate_from_frequencies\u51fd\u6570\nword_frequence = {x[0]: x[1] for x in words_stat.head(500).values}\nword_frequence_dict = {}\nfor key in word_frequence:\n    word_frequence_dict[key] = word_frequence[key]\n\nwordcloud.generate_from_frequencies(word_frequence_dict)\n# \u4ece\u80cc\u666f\u56fe\u7247\u751f\u6210\u989c\u8272\u503c\n# image_colors = ImageColorGenerator(color_mask)\n# \u91cd\u65b0\u4e0a\u8272\n# wordcloud.recolor(color_func=image_colors)\n# \u4fdd\u5b58\u56fe\u7247\nwordcloud.to_file(&#039;output.png&#039;)\nplt.imshow(wordcloud)\nplt.axis(&quot;off&quot;)\nplt.show()<\/code><\/pre>\n<p><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/egonlin.com\/wp-content\/uploads\/2022\/04\/\u722c\u866bor\u6570\u636e\u5206\u6790\u64cd\u4f5c\u5b9e\u4f8b-2.png'><img class=\"lazyload lazyload-style-2\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  data-original=\"https:\/\/egonlin.com\/wp-content\/uploads\/2022\/04\/\u722c\u866bor\u6570\u636e\u5206\u6790\u64cd\u4f5c\u5b9e\u4f8b-2.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" \/><\/div><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u722c\u866bor\u6570\u636e\u5206\u6790 \u4f7f\u7528requests\u5e93\u722c\u53d6\u54d4\u54e9\u54d4\u54e9\u89c6\u9891\u5f39\u5e55\uff0c\u7ed3\u5408BeautifulSoup\u89e3\u6790\u722c\u53d6\u7684\u6570\u636e \u4f7f [&hellip;]<\/p>\n","protected":false},"author":3,"featured_media":5334,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":[],"categories":[371,384],"tags":[],"_links":{"self":[{"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/posts\/5333"}],"collection":[{"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/egonlin.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=5333"}],"version-history":[{"count":0,"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/posts\/5333\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/egonlin.com\/index.php?rest_route=\/wp\/v2\/media\/5334"}],"wp:attachment":[{"href":"https:\/\/egonlin.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=5333"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/egonlin.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=5333"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/egonlin.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=5333"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}