Fetch and save entry HTML content with metadata

And render the extracted HTML on the entry page in the frontend.
This commit is contained in:
Tyler Hallada 2023-06-07 01:06:03 -04:00
parent 786f3a194f
commit 3f29138bd1
7 changed files with 516 additions and 12 deletions

488
Cargo.lock generated
View File

@ -2,6 +2,12 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 3
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]] [[package]]
name = "ahash" name = "ahash"
version = "0.7.6" version = "0.7.6"
@ -22,6 +28,21 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "alloc-no-stdlib"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
[[package]]
name = "alloc-stdlib"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
dependencies = [
"alloc-no-stdlib",
]
[[package]] [[package]]
name = "android_system_properties" name = "android_system_properties"
version = "0.1.5" version = "0.1.5"
@ -75,6 +96,43 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64cb94155d965e3d37ffbbe7cc5b82c3dd79dd33bd48e536f73d2cfb8d85506f" checksum = "64cb94155d965e3d37ffbbe7cc5b82c3dd79dd33bd48e536f73d2cfb8d85506f"
[[package]]
name = "article_scraper"
version = "2.0.0-alpha.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b0058ad303af21c937736589e6f639eb8b3c64ef3f09e7022e3be8885ae93e8"
dependencies = [
"base64 0.21.0",
"chrono",
"encoding_rs",
"escaper",
"futures",
"image",
"libxml",
"log",
"once_cell",
"regex",
"reqwest",
"rust-embed",
"thiserror",
"tokio",
"url",
]
[[package]]
name = "async-compression"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a"
dependencies = [
"brotli",
"flate2",
"futures-core",
"memchr",
"pin-project-lite",
"tokio",
]
[[package]] [[package]]
name = "async-trait" name = "async-trait"
version = "0.1.68" version = "0.1.68"
@ -162,6 +220,12 @@ version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
[[package]]
name = "bit_field"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
@ -177,12 +241,39 @@ dependencies = [
"generic-array", "generic-array",
] ]
[[package]]
name = "brotli"
version = "3.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor",
]
[[package]]
name = "brotli-decompressor"
version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
]
[[package]] [[package]]
name = "bumpalo" name = "bumpalo"
version = "3.12.1" version = "3.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8"
[[package]]
name = "bytemuck"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.4.3" version = "1.4.3"
@ -233,6 +324,12 @@ dependencies = [
"unicode-width", "unicode-width",
] ]
[[package]]
name = "color_quant"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
[[package]] [[package]]
name = "core-foundation" name = "core-foundation"
version = "0.9.3" version = "0.9.3"
@ -265,6 +362,7 @@ dependencies = [
"ansi-to-html", "ansi-to-html",
"anyhow", "anyhow",
"argh", "argh",
"article_scraper",
"axum", "axum",
"bytes", "bytes",
"chrono", "chrono",
@ -304,6 +402,15 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484" checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484"
[[package]]
name = "crc32fast"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "crossbeam-channel" name = "crossbeam-channel"
version = "0.5.8" version = "0.5.8"
@ -314,6 +421,30 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "crossbeam-deque"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"scopeguard",
]
[[package]] [[package]]
name = "crossbeam-queue" name = "crossbeam-queue"
version = "0.3.8" version = "0.3.8"
@ -333,6 +464,12 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]] [[package]]
name = "crypto-common" name = "crypto-common"
version = "0.1.6" version = "0.1.6"
@ -474,6 +611,12 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "entities"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
[[package]] [[package]]
name = "errno" name = "errno"
version = "0.3.1" version = "0.3.1"
@ -495,12 +638,37 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "escaper"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a53eb97b7349ba1bdb31839eceafe9aaae8f1d8d944dc589b67fb0b26e1c1666"
dependencies = [
"entities",
]
[[package]] [[package]]
name = "event-listener" name = "event-listener"
version = "2.5.3" version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "exr"
version = "1.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "279d3efcc55e19917fff7ab3ddd6c14afb6a90881a0078465196fe2f99d08c56"
dependencies = [
"bit_field",
"flume",
"half",
"lebe",
"miniz_oxide",
"rayon-core",
"smallvec",
"zune-inflate",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "1.9.0" version = "1.9.0"
@ -510,6 +678,15 @@ dependencies = [
"instant", "instant",
] ]
[[package]]
name = "fdeflate"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d329bdeac514ee06249dabc27877490f17f5d371ec693360768b838e19f3ae10"
dependencies = [
"simd-adler32",
]
[[package]] [[package]]
name = "feed-rs" name = "feed-rs"
version = "1.3.0" version = "1.3.0"
@ -540,6 +717,29 @@ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
[[package]]
name = "flate2"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "flume"
version = "0.10.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577"
dependencies = [
"futures-core",
"futures-sink",
"nanorand",
"pin-project",
"spin",
]
[[package]] [[package]]
name = "fnv" name = "fnv"
version = "1.0.7" version = "1.0.7"
@ -579,6 +779,21 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "futures"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.28" version = "0.3.28"
@ -595,6 +810,17 @@ version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
[[package]]
name = "futures-executor"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]] [[package]]
name = "futures-intrusive" name = "futures-intrusive"
version = "0.4.2" version = "0.4.2"
@ -606,6 +832,23 @@ dependencies = [
"parking_lot 0.11.2", "parking_lot 0.11.2",
] ]
[[package]]
name = "futures-io"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
[[package]]
name = "futures-macro"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.15",
]
[[package]] [[package]]
name = "futures-sink" name = "futures-sink"
version = "0.3.28" version = "0.3.28"
@ -624,11 +867,16 @@ version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
dependencies = [ dependencies = [
"futures-channel",
"futures-core", "futures-core",
"futures-io",
"futures-macro",
"futures-sink", "futures-sink",
"futures-task", "futures-task",
"memchr",
"pin-project-lite", "pin-project-lite",
"pin-utils", "pin-utils",
"slab",
] ]
[[package]] [[package]]
@ -648,8 +896,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"js-sys",
"libc", "libc",
"wasi 0.11.0+wasi-snapshot-preview1", "wasi 0.11.0+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
name = "gif"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80792593675e051cf94a4b111980da2ba60d4a83e43e0048c5693baab3977045"
dependencies = [
"color_quant",
"weezl",
] ]
[[package]] [[package]]
@ -671,6 +931,15 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "half"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0"
dependencies = [
"crunchy",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.12.3" version = "0.12.3"
@ -871,6 +1140,25 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"
[[package]]
name = "image"
version = "0.24.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527909aa81e20ac3a44803521443a765550f09b5130c2c2fa1ea59c2f8f50a3a"
dependencies = [
"bytemuck",
"byteorder",
"color_quant",
"exr",
"gif",
"jpeg-decoder",
"num-rational",
"num-traits",
"png",
"qoi",
"tiff",
]
[[package]] [[package]]
name = "indexmap" name = "indexmap"
version = "1.9.3" version = "1.9.3"
@ -943,6 +1231,15 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "jpeg-decoder"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc0000e42512c92e31c2252315bda326620a4e034105e900c98ec492fa077b3e"
dependencies = [
"rayon",
]
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.61" version = "0.3.61"
@ -978,12 +1275,29 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "lebe"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.143" version = "0.2.143"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edc207893e85c5d6be840e969b496b53d94cec8be2d501b214f50daa97fa8024" checksum = "edc207893e85c5d6be840e969b496b53d94cec8be2d501b214f50daa97fa8024"
[[package]]
name = "libxml"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca310e4db05e9b7386ad0734975fabc912b66f6bc50a98f525e690822da1ee0e"
dependencies = [
"libc",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "link-cplusplus" name = "link-cplusplus"
version = "1.0.8" version = "1.0.8"
@ -1078,6 +1392,15 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memoffset"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
dependencies = [
"autocfg",
]
[[package]] [[package]]
name = "mime" name = "mime"
version = "0.3.17" version = "0.3.17"
@ -1090,6 +1413,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "miniz_oxide"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [
"adler",
"simd-adler32",
]
[[package]] [[package]]
name = "mio" name = "mio"
version = "0.8.6" version = "0.8.6"
@ -1102,6 +1435,15 @@ dependencies = [
"windows-sys 0.45.0", "windows-sys 0.45.0",
] ]
[[package]]
name = "nanorand"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
dependencies = [
"getrandom",
]
[[package]] [[package]]
name = "native-tls" name = "native-tls"
version = "0.2.11" version = "0.2.11"
@ -1168,6 +1510,17 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-rational"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.15" version = "0.2.15"
@ -1341,6 +1694,19 @@ version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "png"
version = "0.17.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aaeebc51f9e7d2c150d3f3bfeb667f2aa985db5ef1e3d212847bdedb488beeaa"
dependencies = [
"bitflags",
"crc32fast",
"fdeflate",
"flate2",
"miniz_oxide",
]
[[package]] [[package]]
name = "ppv-lite86" name = "ppv-lite86"
version = "0.2.17" version = "0.2.17"
@ -1380,6 +1746,15 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "qoi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001"
dependencies = [
"bytemuck",
]
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.27.1" version = "0.27.1"
@ -1429,6 +1804,28 @@ dependencies = [
"getrandom", "getrandom",
] ]
[[package]]
name = "rayon"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"num_cpus",
]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.16" version = "0.2.16"
@ -1496,6 +1893,7 @@ version = "0.11.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91"
dependencies = [ dependencies = [
"async-compression",
"base64 0.21.0", "base64 0.21.0",
"bytes", "bytes",
"encoding_rs", "encoding_rs",
@ -1519,14 +1917,50 @@ dependencies = [
"serde_urlencoded", "serde_urlencoded",
"tokio", "tokio",
"tokio-native-tls", "tokio-native-tls",
"tokio-util",
"tower-service", "tower-service",
"url", "url",
"wasm-bindgen", "wasm-bindgen",
"wasm-bindgen-futures", "wasm-bindgen-futures",
"wasm-streams",
"web-sys", "web-sys",
"winreg", "winreg",
] ]
[[package]]
name = "rust-embed"
version = "6.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b68543d5527e158213414a92832d2aab11a84d2571a5eb021ebe22c43aab066"
dependencies = [
"rust-embed-impl",
"rust-embed-utils",
"walkdir",
]
[[package]]
name = "rust-embed-impl"
version = "6.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d4e0f0ced47ded9a68374ac145edd65a6c1fa13a96447b873660b2a568a0fd7"
dependencies = [
"proc-macro2",
"quote",
"rust-embed-utils",
"syn 1.0.109",
"walkdir",
]
[[package]]
name = "rust-embed-utils"
version = "7.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512b0ab6853f7e14e3c8754acb43d6f748bb9ced66aa5915a6553ac8213f7731"
dependencies = [
"sha2",
"walkdir",
]
[[package]] [[package]]
name = "rustix" name = "rustix"
version = "0.37.19" version = "0.37.19"
@ -1726,6 +2160,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "simd-adler32"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "238abfbb77c1915110ad968465608b68e869e0772622c9656714e73e5a1a522f"
[[package]] [[package]]
name = "siphasher" name = "siphasher"
version = "0.3.10" version = "0.3.10"
@ -1757,6 +2197,15 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
dependencies = [
"lock_api",
]
[[package]] [[package]]
name = "sqlformat" name = "sqlformat"
version = "0.2.1" version = "0.2.1"
@ -1962,6 +2411,17 @@ dependencies = [
"once_cell", "once_cell",
] ]
[[package]]
name = "tiff"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7449334f9ff2baf290d55d73983a7d6fa15e01198faef72af07e2a8db851e471"
dependencies = [
"flate2",
"jpeg-decoder",
"weezl",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.1.45" version = "0.1.45"
@ -2445,6 +2905,19 @@ version = "0.2.84"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
[[package]]
name = "wasm-streams"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bbae3363c08332cadccd13b67db371814cd214c2524020932f0804b8cf7c078"
dependencies = [
"futures-util",
"js-sys",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.61" version = "0.3.61"
@ -2455,6 +2928,12 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "weezl"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9193164d4de03a926d909d3bc7c30543cecb35400c02114792c2cae20d5e2dbb"
[[package]] [[package]]
name = "whoami" name = "whoami"
version = "1.4.0" version = "1.4.0"
@ -2660,3 +3139,12 @@ checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d"
dependencies = [ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "zune-inflate"
version = "0.2.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02"
dependencies = [
"simd-adler32",
]

View File

@ -14,6 +14,7 @@ path = "src/lib.rs"
ansi-to-html = "0.1" ansi-to-html = "0.1"
anyhow = "1" anyhow = "1"
argh = "0.1" argh = "0.1"
article_scraper = "2.0.0-alpha.0"
axum = "0.6" axum = "0.6"
bytes = "1.4" bytes = "1.4"
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }

View File

@ -18,6 +18,7 @@ CREATE TABLE IF NOT EXISTS "entries" (
"title" VARCHAR(255), "title" VARCHAR(255),
"url" VARCHAR(2048) NOT NULL, "url" VARCHAR(2048) NOT NULL,
"description" TEXT, "description" TEXT,
"html_content" TEXT,
"feed_id" INTEGER REFERENCES "feeds"(id) NOT NULL, "feed_id" INTEGER REFERENCES "feeds"(id) NOT NULL,
"created_at" timestamp(3) NOT NULL, "created_at" timestamp(3) NOT NULL,
"updated_at" timestamp(3) NOT NULL, "updated_at" timestamp(3) NOT NULL,

View File

@ -98,8 +98,6 @@ pub async fn main() -> Result<()> {
let args: Args = argh::from_env(); let args: Args = argh::from_env();
info!("hello?");
match args.commands { match args.commands {
Commands::AddFeed(args) => { Commands::AddFeed(args) => {
let feed = create_feed( let feed = create_feed(
@ -125,6 +123,7 @@ pub async fn main() -> Result<()> {
title: args.title, title: args.title,
url: args.url, url: args.url,
description: args.description, description: args.description,
html_content: None,
feed_id: args.feed_id, feed_id: args.feed_id,
}, },
) )

View File

@ -1,6 +1,6 @@
use axum::extract::{State, Path}; use axum::extract::{State, Path};
use axum::response::Response; use axum::response::Response;
use maud::html; use maud::{html, PreEscaped};
use sqlx::PgPool; use sqlx::PgPool;
use crate::error::Result; use crate::error::Result;
@ -12,7 +12,7 @@ pub async fn get(Path(id): Path<i32>, State(pool): State<PgPool>, layout: Layout
Ok(layout.render(html! { Ok(layout.render(html! {
@let title = entry.title.unwrap_or_else(|| "Untitled".to_string()); @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
h1 { a href=(entry.url) { (title) } } h1 { a href=(entry.url) { (title) } }
@let description = entry.description.unwrap_or_else(|| "No description".to_string()); @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
p { (description) } (PreEscaped(content))
})) }))
} }

View File

@ -1,5 +1,6 @@
use article_scraper::ArticleScraper;
use feed_rs::parser; use feed_rs::parser;
use reqwest::Client; use reqwest::{Client, Url};
use sqlx::PgPool; use sqlx::PgPool;
use tracing::{info, warn}; use tracing::{info, warn};
@ -9,18 +10,23 @@ use crate::models::entry::{upsert_entries, CreateEntry};
/// For every feed in the database, fetches the feed, parses it, and saves new entries to the /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
/// database. /// database.
pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> { pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
let scraper = ArticleScraper::new(None).await;
let client = Client::new(); let client = Client::new();
let feeds = get_feeds(pool).await?; let feeds = get_feeds(pool).await?;
for feed in feeds { for feed in feeds {
info!("Fetching feed {}: {}", feed.id, feed.url);
let bytes = client.get(feed.url).send().await?.bytes().await?; let bytes = client.get(feed.url).send().await?.bytes().await?;
let parsed_feed = parser::parse(&bytes[..])?; let parsed_feed = parser::parse(&bytes[..])?;
let mut payload = Vec::with_capacity(parsed_feed.entries.len()); let mut payload = Vec::with_capacity(parsed_feed.entries.len());
for entry in parsed_feed.entries { for entry in parsed_feed.entries {
if let Some(link) = entry.links.get(0) { if let Some(link) = entry.links.get(0) {
info!("Fetching entry article: {}", link.href);
let article = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await?;
let entry = CreateEntry { let entry = CreateEntry {
title: entry.title.map(|t| t.content), title: entry.title.map(|t| t.content),
url: link.href.clone(), url: link.href.clone(),
description: entry.summary.map(|s| s.content), description: entry.summary.map(|s| s.content),
html_content: article.get_content(),
feed_id: feed.id, feed_id: feed.id,
}; };
payload.push(entry); payload.push(entry);

View File

@ -11,6 +11,7 @@ pub struct Entry {
pub title: Option<String>, pub title: Option<String>,
pub url: String, pub url: String,
pub description: Option<String>, pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: i32, pub feed_id: i32,
pub created_at: NaiveDateTime, pub created_at: NaiveDateTime,
pub updated_at: NaiveDateTime, pub updated_at: NaiveDateTime,
@ -25,6 +26,7 @@ pub struct CreateEntry {
pub url: String, pub url: String,
#[validate(length(max = 524288))] #[validate(length(max = 524288))]
pub description: Option<String>, pub description: Option<String>,
pub html_content: Option<String>,
#[validate(range(min = 1))] #[validate(range(min = 1))]
pub feed_id: i32, pub feed_id: i32,
} }
@ -52,13 +54,14 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
sqlx::query_as!( sqlx::query_as!(
Entry, Entry,
"INSERT INTO entries ( "INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at title, url, description, html_content, feed_id, created_at, updated_at
) VALUES ( ) VALUES (
$1, $2, $3, $4, now(), now() $1, $2, $3, $4, $5, now(), now()
) RETURNING *", ) RETURNING *",
payload.title, payload.title,
payload.url, payload.url,
payload.description, payload.description,
payload.html_content,
payload.feed_id, payload.feed_id,
) )
.fetch_one(pool) .fetch_one(pool)
@ -77,23 +80,26 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len()); let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len()); let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len()); let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len()); let mut feed_ids = Vec::with_capacity(payload.len());
payload.iter().map(|entry| { payload.iter().map(|entry| {
titles.push(entry.title.clone()); titles.push(entry.title.clone());
urls.push(entry.url.clone()); urls.push(entry.url.clone());
descriptions.push(entry.description.clone()); descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id); feed_ids.push(entry.feed_id);
entry.validate() entry.validate()
}).collect::<Result<Vec<()>, ValidationErrors>>()?; }).collect::<Result<Vec<()>, ValidationErrors>>()?;
sqlx::query_as!( sqlx::query_as!(
Entry, Entry,
"INSERT INTO entries ( "INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at title, url, description, html_content, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::int[]) ) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[], $5::int[])
RETURNING *", RETURNING *",
titles.as_slice() as &[Option<String>], titles.as_slice() as &[Option<String>],
urls.as_slice(), urls.as_slice(),
descriptions.as_slice() as &[Option<String>], descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(), feed_ids.as_slice(),
) )
.fetch_all(pool) .fetch_all(pool)
@ -112,24 +118,27 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len()); let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len()); let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len()); let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len()); let mut feed_ids = Vec::with_capacity(payload.len());
payload.iter().map(|entry| { payload.iter().map(|entry| {
titles.push(entry.title.clone()); titles.push(entry.title.clone());
urls.push(entry.url.clone()); urls.push(entry.url.clone());
descriptions.push(entry.description.clone()); descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id); feed_ids.push(entry.feed_id);
entry.validate() entry.validate()
}).collect::<Result<Vec<()>, ValidationErrors>>()?; }).collect::<Result<Vec<()>, ValidationErrors>>()?;
sqlx::query_as!( sqlx::query_as!(
Entry, Entry,
"INSERT INTO entries ( "INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at title, url, description, html_content, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::int[]) ) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[], $5::int[])
ON CONFLICT DO NOTHING ON CONFLICT DO NOTHING
RETURNING *", RETURNING *",
titles.as_slice() as &[Option<String>], titles.as_slice() as &[Option<String>],
urls.as_slice(), urls.as_slice(),
descriptions.as_slice() as &[Option<String>], descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(), feed_ids.as_slice(),
) )
.fetch_all(pool) .fetch_all(pool)