Browse Source

fix xml converter

Yurii Sokolovskyi 3 tháng trước
mục cha
commit
d534f16d82
5 tập tin đã thay đổi với 69 bổ sung và 58 xóa
  1. 2 0
      .gitignore
  2. 0 1
      src/main.rs
  3. 3 28
      src/templates/html.rs
  4. 28 0
      src/templates/util.rs
  5. 36 29
      src/templates/xml.rs

+ 2 - 0
.gitignore

@@ -1,3 +1,5 @@
 /target
 site/
 
+/.idea/.gitignore
+/.idea/vcs.xml

+ 0 - 1
src/main.rs

@@ -87,7 +87,6 @@ impl List {
         if Config::global().include_html {
             write(self.to_html(), &self.out_dir, "html");
         }
-        write_if_changed(&self.out_dir.join("atom.xml"), self.to_xml());
     }
 
     // Used with atom

+ 3 - 28
src/templates/html.rs

@@ -1,5 +1,5 @@
 use std::error::Error;
-use super::util::xml_safe as x;
+use super::util::{parse_email, xml_safe as x};
 use crate::models::*;
 use crate::templates::PAGE_SIZE;
 use crate::util::*;
@@ -208,18 +208,6 @@ impl MailAddress {
 }
 
 impl Thread {
-    fn extract_html_content(&self, part: &ParsedMail) -> Option<String> {
-        if part.ctype.mimetype == "text/html" {
-            return part.get_body().ok();
-        }
-        for subpart in &part.subparts {
-            if let Some(html) = self.extract_html_content(subpart) {
-                return Some(html);
-            }
-        }
-        None
-    }
-
     fn retrieve_attachments(parsed_mail: ParsedMail, dir_parent: PathBuf, file_name: String) -> io::Result<Vec<PathBuf>> {
         // lists of all paths to attachments
         let mut paths = Vec::new();
@@ -256,19 +244,6 @@ impl Thread {
         Ok(paths)
     }
 
-    fn parse_email(&self, parsed_mail: &ParsedMail) -> Result<String, Box<dyn Error>> {
-        if let Some(html_content) = self.extract_html_content(&parsed_mail) {
-            return Ok(html_content.replace("\r\n", ""));
-        } else if parsed_mail.ctype.mimetype.starts_with("text/plain") {
-            // Convert plain text to simple HTML
-            let plain_text = parsed_mail.get_body()?.replace("\r\n", "<br>");
-            let html_content = format!("<html><body>{}</body></html>", plain_text);
-            return Ok(html_content);
-        }
-
-        Err(From::from("No HTML or text content found"))
-    }
-
     pub fn to_html(&self, out_dir: PathBuf) -> String {
         fn extract_styles(html: &str) -> String {
             let style_pattern = Regex::new(r"(?is)<style[^>]*>(.*?)</style>").unwrap();
@@ -379,7 +354,7 @@ impl Thread {
                 }
             );
 
-            let data = match fs::read_to_string(&msg.original_path) {
+            let data = match fs::read_to_string(&msg.original_path.clone()) {
                 Ok(content) => content,
                 Err(e) => {
                     eprintln!("Error reading file: {}", e);
@@ -392,7 +367,7 @@ impl Thread {
                 Err(e) => return format!("Error parsing email: {}", e),
             };
 
-            let html = self.parse_email(&parsed_mail).unwrap_or_else(|_| msg.body.clone());
+            let html = parse_email(&parsed_mail).unwrap_or_else(|_| msg.body.clone());
             let styles = extract_styles(&html);
             let mut body_content = remove_style_tags(&extract_body(&html));
             body_content = purple_numbers(&*body_content, "#");

+ 28 - 0
src/templates/util.rs

@@ -1,3 +1,6 @@
+use std::error::Error;
+use mailparse::ParsedMail;
+
 // less efficient, easier api
 pub fn xml_safe(text: &str) -> String {
     // note we escape more than we need to
@@ -18,3 +21,28 @@ pub fn xml_safe(text: &str) -> String {
         })
         .collect::<String>()
 }
+
+pub fn extract_html_content(part: &ParsedMail) -> Option<String> {
+    if part.ctype.mimetype == "text/html" {
+        return part.get_body().ok();
+    }
+    for subpart in &part.subparts {
+        if let Some(html) = extract_html_content(subpart) {
+            return Some(html);
+        }
+    }
+    None
+}
+
+pub fn parse_email(parsed_mail: &ParsedMail) -> Result<String, Box<dyn Error>> {
+    if let Some(html_content) = extract_html_content(&parsed_mail) {
+        return Ok(html_content.replace("\r\n", ""));
+    } else if parsed_mail.ctype.mimetype.starts_with("text/plain") {
+        // Convert plain text to simple HTML
+        let plain_text = parsed_mail.get_body()?.replace("\r\n", "<br>");
+        let html_content = format!("<html><body>{}</body></html>", plain_text);
+        return Ok(html_content);
+    }
+
+    Err(From::from("No HTML or text content found"))
+}

+ 36 - 29
src/templates/xml.rs

@@ -1,8 +1,12 @@
-use super::util::xml_safe as x;
+use std::fs;
+use mail_parser::PartType::Html;
+use mailparse::parse_mail;
+use super::util::{parse_email, xml_safe as x};
 use crate::models::*;
 use crate::util::unformat_flowed;
 // use crate::templates::util::xml_safe;
 // use anyhow::{Context, Result};
+use scraper::{Html as scraper_html, Selector};
 
 fn feed(
     feed_title: &str,
@@ -58,10 +62,37 @@ fn message(
 impl StrMessage {
     pub fn to_xml(&self) -> String {
         let msg = self;
-        let body = match self.flowed {
-            true => unformat_flowed(&self.body),
-            false => self.body.clone(),
+
+        let data = match fs::read_to_string(self.original_path.clone()) {
+            Ok(content) => content,
+            Err(e) => {
+                eprintln!("Error reading file: {}", e);
+                return "Error loading content".to_string();
+            }
+        };
+        
+        let parsed_mail = match parse_mail(&data.as_bytes()) {
+            Ok(mail) => mail,
+            Err(e) => return format!("Error parsing email: {}", e),
         };
+        
+        let html = parse_email(&parsed_mail).unwrap_or_else(|_| msg.body.clone());
+        
+        let document = scraper_html::parse_document(&*html);
+        let body = Selector::parse("body").unwrap();
+        
+        // Traverse the parsed HTML and extract all text from the body
+        let text = document.select(&body)
+            .next().unwrap()
+            .text()
+            .collect::<Vec<_>>()
+            .join(" ");
+        
+        
+        // let body = match self.flowed {
+        //     true => unformat_flowed(&self.body),
+        //     false => self.body.clone(),
+        // };
         message(
             &x(&msg.subject),
             &x(&self.url),
@@ -69,31 +100,7 @@ impl StrMessage {
             &msg.received.to_rfc3339(),
             &x(&msg.from.clone().name.unwrap_or(msg.from.clone().address)),
             &x(&msg.from.address),
-            &x(&body),
-        )
-    }
-}
-
-// TODO dedup
-impl List {
-    pub fn to_xml(&self) -> String {
-        let mut entry_list = String::new();
-        for msg in &self.recent_messages {
-            entry_list.push_str(&msg.to_xml());
-        }
-        // Sometimes its unclear whether to do stuff like this in models.rs or here. could refactor
-        let last_updated = self
-            .recent_messages
-            .get(0)
-            .map(|x| x.received.clone())
-            .unwrap_or(crate::util::EPOCH);
-        feed(
-            &self.config.name,
-            &self.url,
-            &last_updated.to_rfc3339(),
-            &self.config.email,
-            &self.config.email,
-            &entry_list,
+            &x(&text),
         )
     }
 }