Compare commits

..

2 commits

Author SHA1 Message Date
3d28f93f51
feat(mdhtml): support cloaker fn, cleanup 2024-07-15 23:47:29 +02:00
b43431cb03
feat: cloak media urls 2024-07-15 23:47:18 +02:00
5 changed files with 84 additions and 40 deletions

View file

@ -21,7 +21,7 @@ pub async fn cloak(ctx: upub::Context, post_contents: bool) -> Result<(), Reques
if post_contents {
let mut stream = upub::model::object::Entity::find()
.filter(upub::model::object::Column::Content.is_not_null())
.filter(upub::model::object::Column::Content.like("<img"))
.select_only()
.select_column(upub::model::object::Column::Internal)
.select_column(upub::model::object::Column::Content)
@ -30,7 +30,7 @@ pub async fn cloak(ctx: upub::Context, post_contents: bool) -> Result<(), Reques
.await?;
while let Some((internal, content)) = stream.try_next().await? {
let sanitized = mdhtml::safe_html(&content);
let sanitized = ctx.sanitize(&content);
if sanitized != content {
let model = upub::model::object::ActiveModel {
internal: Unchanged(internal),

View file

@ -28,6 +28,7 @@ serde-inline-default = "0.2"
toml = "0.8"
uriproxy = { path = "../../utils/uriproxy" }
httpsign = { path = "../../utils/httpsign/" }
mdhtml = { path = "../../utils/mdhtml/" }
jrd = "0.1"
tracing = "0.1"
sea-orm = { version = "0.12", features = ["macros"] }

View file

@ -28,10 +28,34 @@ pub trait Cloaker {
Some(url)
}
fn cloaked(&self, url: &str) -> String;
}
/// [`Cloaker`] implementation backed by the server context.
impl Cloaker for crate::Context {
/// Exposes the `security.proxy_secret` configuration value as the secret
/// required by the [`Cloaker`] trait.
fn secret(&self) -> &str {
&self.cfg().security.proxy_secret
}
/// Returns the cloaked form of `url`: a `/proxy/{sig}/{url}` path assembled from the
/// two values returned by `self.cloak(url)`.
fn cloaked(&self, url: &str) -> String {
// `url` is shadowed here: from now on it is the second value returned by `cloak`, not the input
let (sig, url) = self.cloak(url);
// NOTE(review): `crate::url!` presumably prefixes the instance base url -- confirm against the macro definition
crate::url!(self, "/proxy/{sig}/{url}")
}
}
// TODO this shouldn't sit in bare context.rs but also having it here is weird!!
impl crate::Context {
    /// Sanitizes `text` as HTML, rewriting the urls handed to the sanitizer's cloaker.
    ///
    /// Urls for which `is_local` returns true are kept verbatim; every other url is
    /// replaced with the value returned by `cloaked`.
    pub fn sanitize(&self, text: &str) -> String {
        // the callback is boxed and handed over to the sanitizer, so it needs its own copy of the context
        let ctx = self.clone();
        let rewrite = move |src: &str| -> String {
            if ctx.is_local(src) {
                src.to_string()
            } else {
                ctx.cloaked(src)
            }
        };
        mdhtml::Sanitizer::new(Box::new(rewrite)).html(text)
    }
}

View file

@ -1,7 +1,7 @@
use apb::{field::OptionalString, Collection, Document, Endpoints, Node, Object, PublicKey};
use sea_orm::{sea_query::Expr, ActiveModelTrait, ActiveValue::{Unchanged, NotSet, Set}, ColumnTrait, ConnectionTrait, DbErr, EntityTrait, IntoActiveModel, QueryFilter};
use super::Fetcher;
use super::{Cloaker, Fetcher};
#[derive(Debug, thiserror::Error)]
pub enum NormalizerError {
@ -27,10 +27,9 @@ impl Normalizer for crate::Context {
async fn insert_object(&self, object: impl apb::Object, tx: &impl ConnectionTrait) -> Result<crate::model::object::Model, NormalizerError> {
let mut object_model = AP::object(&object)?;
// TODO should we make sure content only contains a safe subset of html ? frontend does it too
// if let Some(content) = object_model.content {
// object_model.content = Some(mdhtml::safe_html(&content));
// }
if let Some(content) = object_model.content {
object_model.content = Some(self.sanitize(&content));
}
// fix context for remote posts
// > if any link is broken or we get rate limited, the whole insertion fails which is
@ -79,14 +78,19 @@ impl Normalizer for crate::Context {
},
Node::Link(l) => crate::model::attachment::ActiveModel {
internal: sea_orm::ActiveValue::NotSet,
url: Set(l.href().unwrap_or_default().to_string()),
url: Set(self.cloaked(l.href().unwrap_or_default())),
object: Set(object_model.internal),
document_type: Set(apb::DocumentType::Page),
name: Set(l.name().str()),
media_type: Set(l.media_type().unwrap_or("link").to_string()),
},
Node::Object(o) =>
AP::attachment_q(o.as_document()?, object_model.internal, None)?,
Node::Object(o) => {
let mut model = AP::attachment_q(o.as_document()?, object_model.internal, None)?;
if let Set(u) | Unchanged(u) = model.url {
model.url = Set(self.cloaked(&u));
}
model
},
};
crate::model::attachment::Entity::insert(attachment_model)
.exec(tx)

View file

@ -1,15 +1,51 @@
use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer};
use html5ever::{tendril::SliceExt, tokenizer::{BufferQueue, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer}};
use comrak::{markdown_to_html, Options};
/// In our case, our sink only contains a tokens vector
#[derive(Debug, Clone, Default)]
struct Sink {
pub media_proxy: Option<String>,
/// Boxed callback that receives a url and returns the url to emit in its place.
///
/// The sanitizer applies it to the `src` attribute of `img` tags.
pub type Cloaker = Box<dyn Fn(&str) -> String>;
/// HTML sanitizer state: an optional url-rewriting callback plus the output accumulated so far.
#[derive(Default)]
pub struct Sanitizer {
/// Callback applied to `img` `src` values; when `None` the value is kept as-is.
pub cloaker: Option<Cloaker>,
/// Output buffer; this is what [`Sanitizer::html`] returns once processing is done.
pub buffer: String,
}
impl TokenSink for Sink {
/// Sanitizes an HTML fragment with a default [`Sanitizer`], i.e. one without a cloaker.
pub fn safe_html(text: &str) -> String {
    let sanitizer = Sanitizer::default();
    sanitizer.html(text)
}
/// Renders markdown through a default [`Sanitizer`], i.e. one without a cloaker.
pub fn safe_markdown(text: &str) -> String {
    let sanitizer = Sanitizer::default();
    sanitizer.markdown(text)
}
impl Sanitizer {
    /// Creates a sanitizer with an empty buffer that passes `img` `src` values through `cloak`.
    pub fn new(cloak: Cloaker) -> Self {
        Self {
            cloaker: Some(cloak),
            buffer: String::new(),
        }
    }

    /// Converts `text` from markdown to HTML with default options, then sanitizes the result.
    pub fn markdown(self, text: &str) -> String {
        let rendered = markdown_to_html(text, &Options::default());
        self.html(&rendered)
    }

    /// Feeds `text` to the tokenizer with this sanitizer as its sink and returns the
    /// sink's buffer once the tokenizer has been ended.
    pub fn html(self, text: &str) -> String {
        let mut queue = BufferQueue::default();
        queue.push_back(text.to_tendril().try_reinterpret().unwrap());

        let mut tokenizer = Tokenizer::new(self, Default::default());
        // the result of `feed` is deliberately discarded; leftover input is only reported below
        let _ = tokenizer.feed(&mut queue);
        if !queue.is_empty() {
            tracing::warn!("buffer input not empty after processing html");
        }
        tokenizer.end();

        tokenizer.sink.buffer
    }
}
impl TokenSink for Sanitizer {
type Handle = ();
/// Each processed token will be handled by this method
@ -38,8 +74,8 @@ impl TokenSink for Sink {
"img" => for attr in tag.attrs {
match attr.name.local.as_ref() {
"src" => {
let src = if let Some(ref proxy) = self.media_proxy {
format!("{proxy}{}", attr.value.as_ref())
let src = if let Some(ref cloak) = self.cloaker {
cloak(attr.value.as_ref())
} else {
attr.value.to_string()
};
@ -86,24 +122,3 @@ impl TokenSink for Sink {
TokenSinkResult::Continue
}
}
/// Converts markdown to HTML with default options and passes the result through `safe_html`.
pub fn safe_markdown(text: &str) -> String {
    let rendered = markdown_to_html(text, &Options::default());
    safe_html(&rendered)
}
/// Feeds `text` to the tokenizer with a default `Sink` and returns the sink's buffer
/// once the tokenizer has been ended.
pub fn safe_html(text: &str) -> String {
    let mut queue = BufferQueue::default();
    queue.push_back(text.to_tendril().try_reinterpret().unwrap());

    let mut tokenizer = Tokenizer::new(Sink::default(), Default::default());
    // the result of `feed` is deliberately discarded; leftover input is only reported below
    let _ = tokenizer.feed(&mut queue);
    if !queue.is_empty() {
        tracing::warn!("buffer input not empty after processing html");
    }
    tokenizer.end();

    tokenizer.sink.buffer
}