朴素贝叶斯(垃圾邮件分类)
邮件训练集下载地址:https://pan.baidu.com/s/1uwDJJeJAr-DX82XlCRGR9Q
import re import os from jieba import cut from itertools import chain from collections import Counter import numpy as np from sklearn.naive_bayes import MultinomialNB
def get_words(filename):
    """Read a text file and collect its segmented words.

    Each line is stripped of surrounding whitespace, cleaned of the
    characters matched by the noise pattern, and segmented with ``cut``
    (imported from jieba). Tokens whose length is 1 or less are discarded.

    Args:
        filename: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list of the retained tokens, in the order they were encountered.
    """
    # Noise characters: period, 【】 brackets, digits, 、, —, 。, comma,
    # exclamation mark, tilde and asterisk.
    noise = re.compile(r'[.【】0-9、——。,!~\*]')
    collected = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw in handle:
            cleaned = noise.sub('', raw.strip())
            # Segment the cleaned line and keep only multi-character tokens.
            collected += [token for token in cut(cleaned) if len(token) > 1]
    return collected