email_client.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. from imapclient import IMAPClient
  2. import email
  3. from email.utils import parsedate_to_datetime, getaddresses, parseaddr
  4. from email.header import decode_header
  5. from datetime import datetime
  6. from bs4 import BeautifulSoup
  7. import re
  8. class EmailClient:
  9. def __init__(self, email_config: dict):
  10. self.config = email_config
  11. self.imap_server = None
  12. def connect(self):
  13. # 连接到 IMAP 服务器
  14. self.imap_server = IMAPClient(self.config['receive_server_address'], self.config['receive_server_port'])
  15. # 登录
  16. self.imap_server.login(self.config['email_account'], self.config['email_password'])
  17. # 设置标识信息
  18. self.imap_server.id_({"name": "IMAPClient", "version": "2.1.0"})
  19. def disconnect(self):
  20. """断开 IMAP 连接"""
  21. if self.imap_server:
  22. self.imap_server.logout()
  23. self.imap_server = None
  24. def extract_links_from_html(self, html_content):
  25. """
  26. 从HTML内容中提取超链接
  27. :param html_content: HTML内容
  28. :return: 链接列表
  29. """
  30. links = []
  31. try:
  32. soup = BeautifulSoup(html_content, 'html.parser')
  33. # 提取所有带href属性的<a>标签
  34. for link in soup.find_all('a', href=True):
  35. links.append({
  36. 'url': link['href'],
  37. 'text': link.get_text(strip=True)
  38. })
  39. except Exception as e:
  40. print(f"解析HTML链接时出错: {e}")
  41. return links
  42. def extract_links_from_text(self, text_content):
  43. """
  44. 从文本内容中提取链接
  45. :param text_content: 文本内容
  46. :return: 链接列表
  47. """
  48. # 使用正则表达式匹配URL
  49. url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  50. urls = re.findall(url_pattern, text_content)
  51. return [{'url': url, 'text': url} for url in urls]
  52. def decode_mime_header(self, header):
  53. """解码 MIME 编码的邮件头"""
  54. if header is None:
  55. return ""
  56. decoded_parts = decode_header(header)
  57. decoded_str = ""
  58. for part, charset in decoded_parts:
  59. if isinstance(part, bytes):
  60. try:
  61. # 尝试使用指定的字符集解码
  62. if charset:
  63. decoded_str += part.decode(charset, errors='replace')
  64. else:
  65. # 如果没有指定字符集,尝试常见字符集
  66. try:
  67. decoded_str += part.decode('utf-8', errors='replace')
  68. except:
  69. decoded_str += part.decode('latin-1', errors='replace')
  70. except Exception as e:
  71. # 如果所有解码都失败,使用替代表示
  72. decoded_str += part.decode('utf-8', errors='replace')
  73. else:
  74. decoded_str += part
  75. return decoded_str
  76. def check_new_email(self, last_check_uid=None):
  77. """
  78. 检查新邮件
  79. :return:
  80. """
  81. if not self.imap_server:
  82. self.connect()
  83. self.imap_server.select_folder('INBOX')
  84. uids = self.imap_server.search('UNSEEN')
  85. if last_check_uid is not None:
  86. new_uid = [uid for uid in uids if uid > last_check_uid]
  87. else:
  88. new_uid = uids
  89. email_details = []
  90. if new_uid:
  91. response = self.imap_server.fetch(new_uid, ['BODY[]'])
  92. for msgid, data in sorted(response.items(), key=lambda x: x[0]):
  93. try:
  94. raw_email = data[b'BODY[]']
  95. email_message = email.message_from_bytes(raw_email)
  96. # 使用解码函数处理邮件头
  97. subject = self.decode_mime_header(email_message.get('Subject'))
  98. from_header = self.decode_mime_header(email_message.get('From'))
  99. # 解析日期
  100. date_str = email_message.get('Date')
  101. date_obj = None
  102. formatted_date = ""
  103. if date_str:
  104. try:
  105. date_obj = parsedate_to_datetime(date_str)
  106. formatted_date = date_obj.strftime('%Y-%m-%d %H:%M:%S')
  107. except Exception as e:
  108. print(f"日期解析错误: {e}, 原始值: {date_str}")
  109. formatted_date = date_str # 保留原始日期字符串
  110. # 解析发件人
  111. sender_name, sender_email = parseaddr(from_header) if from_header else ('', '')
  112. # 解析收件人信息
  113. to_header = self.decode_mime_header(email_message.get('To', ''))
  114. recipient_name, recipient_email = parseaddr(to_header) if to_header else ('', '')
  115. # 提取邮件内容
  116. text_content = ""
  117. html_content = ""
  118. # 统一的内容提取函数
  119. def extract_content(part):
  120. charset = part.get_content_charset('utf-8')
  121. payload = part.get_payload(decode=True)
  122. try:
  123. content = payload.decode(charset, errors='replace')
  124. except Exception as decode_error:
  125. print(f"解码错误: {decode_error}, 尝试其他编码")
  126. try:
  127. content = payload.decode('latin-1', errors='replace')
  128. except:
  129. content = payload.decode('utf-8', errors='replace')
  130. return content
  131. # 处理邮件内容
  132. if email_message.is_multipart():
  133. for part in email_message.walk():
  134. content_type = part.get_content_type()
  135. # 跳过附件
  136. content_disposition = part.get("Content-Disposition", "")
  137. if "attachment" in content_disposition:
  138. continue
  139. # 处理正文内容
  140. if content_type == "text/plain":
  141. text_content = extract_content(part)
  142. elif content_type == "text/html":
  143. html_content = extract_content(part)
  144. else:
  145. content = extract_content(email_message)
  146. content_type = email_message.get_content_type()
  147. if "html" in content_type.lower():
  148. html_content = content
  149. else:
  150. text_content = content
  151. # 确定主要内容和类型
  152. if html_content:
  153. content = html_content
  154. content_type = "html"
  155. elif text_content:
  156. content = text_content
  157. content_type = "text"
  158. else:
  159. content = ""
  160. content_type = ""
  161. email_details.append({
  162. 'uid': msgid,
  163. 'subject': subject, # 使用解码后的主题
  164. 'sender': from_header, # 使用解码后的发件人信息
  165. 'sender_name': sender_name,
  166. 'sender_email': sender_email,
  167. 'recipient': to_header, # 使用解码后的收件人信息
  168. 'recipient_name': recipient_name,
  169. 'recipient_email': recipient_email,
  170. 'date': formatted_date,
  171. 'content': content,
  172. 'content_type': content_type,
  173. })
  174. except Exception as e:
  175. print(f"处理邮件 {msgid} 时出错: {e}")
  176. email_details.append({
  177. 'uid': msgid,
  178. 'error': str(e)
  179. })
  180. return email_details
# Add the mail-header decoding function