parse web links

This commit is contained in:
Evgeny Poberezkin 2025-05-11 23:07:09 +01:00
parent 13e19fd574
commit f0e46fa6f6
No known key found for this signature in database
GPG key ID: 494BDDD9A28B577D
5 changed files with 139 additions and 40 deletions

View file

@ -277,6 +277,7 @@ library
, tls >=1.9.0 && <1.10
, unliftio ==0.2.*
, unliftio-core ==0.2.*
, uri-bytestring >=0.3.3.1 && <0.4
, uuid ==1.3.*
, zip ==2.0.*
default-language: Haskell2010

View file

@ -1,4 +1,5 @@
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NamedFieldPuns #-}
@ -16,7 +17,8 @@ import qualified Data.Aeson as J
import qualified Data.Aeson.TH as JQ
import Data.Attoparsec.Text (Parser)
import qualified Data.Attoparsec.Text as A
import Data.Char (isDigit, isPunctuation, isSpace)
import qualified Data.ByteString.Char8 as B
import Data.Char (isAlpha, isAscii, isDigit, isPunctuation, isSpace)
import Data.Either (fromRight)
import Data.Functor (($>))
import Data.List (foldl', intercalate)
@ -32,10 +34,11 @@ import Simplex.Chat.Types
import Simplex.Messaging.Agent.Protocol (AConnectionLink (..), ConnReqUriData (..), ConnShortLink (..), ConnectionLink (..), ConnectionRequestUri (..), ContactConnType (..), SMPQueue (..), simplexConnReqUri, simplexShortLink)
import Simplex.Messaging.Encoding.String
import Simplex.Messaging.Parsers (defaultJSON, dropPrefix, enumJSON, fstToLower, sumTypeJSON)
import Simplex.Messaging.Protocol (ProtocolServer (..))
import Simplex.Messaging.Protocol (ProtocolServer (..), sameSrvAddr)
import Simplex.Messaging.Util (decodeJSON, safeDecodeUtf8)
import System.Console.ANSI.Types
import qualified Text.Email.Validate as Email
import qualified URI.ByteString as U
data Markdown = Markdown (Maybe Format) Text | Markdown :|: Markdown
deriving (Eq, Show)
@ -47,23 +50,30 @@ data Format
| Snippet
| Secret
| Colored {color :: FormatColor}
| WebLink {text :: WLText, scheme :: Text, originalUri :: Text, sanitizedUri :: Text}
| SimplexLink {linkType :: SimplexLinkType, simplexUri :: AConnectionLink, smpHosts :: NonEmpty Text}
-- linkUri is Nothing when there is no link text or it is the same as URI, in which case the text of the fragment is URI.
-- sanitizedUri is Nothing when original URI is already sanitized.
-- spoofed is True when link text is a valid URI, and it is different from link URI.
| WebLink {scheme :: Text, linkUri :: Maybe Text, sanitizedUri :: Maybe Text, spoofed :: Bool}
| SimplexLink {linkType :: SimplexLinkType, simplexUri :: AConnectionLink, smpHosts :: NonEmpty Text, spoofed :: Bool}
| Mention {memberName :: Text}
| Email
| Phone
deriving (Eq, Show)
-- This type provides support for descriptive markdown links, such as [click here](https://example.com),
-- with detection of any potentially malicious links, such as:
-- - tracking parameters (tracked via difference between originalUri and sanitizedUri in WebLink)
-- - WLBadPath: different path in link text from the path in the link, e.g. [example.com/hello](https://example.com/goodbuy), except query string parameters
-- - WLBadHost: different domain in the link text from the domain in the link, e.g [example.com](https://error.com)
-- Sending clients should reject such links, receiving clients should warn before opening links with bad path or host and show them in red.
data WLText
= WLText {linkText :: Maybe Text} -- the text is Nothing if it's the same as the link, it should be used to decide whether to show alert before opening.
| WLBadPath {badLink :: Text}
| WLBadHost {badLink :: Text}
data SimplexOrWebLink = SOWSimplex AConnectionLink | SOWWeb U.URI
parseLinkUri :: Text -> Either String SimplexOrWebLink
parseLinkUri t = case strDecode s of
Right cLink -> Right $ SOWSimplex cLink
Left _ -> case U.parseURI U.laxURIParserOptions s of
Right uri@U.URI {uriAuthority} -> case uriAuthority of
Just U.Authority {authorityHost = U.Host h}
| B.elem '.' h -> Right $ SOWWeb uri
| otherwise -> Left "Invalid URI host"
Nothing -> Left "No URI host"
Left e -> Left $ "Invalid URI: " <> show e
where
s = encodeUtf8 t
mentionedNames :: MarkdownList -> [Text]
mentionedNames = mapMaybe (\(FormattedText f _) -> mentionedName =<< f)
@ -184,6 +194,7 @@ markdownP = mconcat <$> A.many' fragmentP
'#' -> A.char '#' *> secretP
'!' -> coloredP <|> wordP
'@' -> mentionP <|> wordP
'[' -> webLinkP <|> wordP
_
| isDigit c -> phoneP <|> wordP
| otherwise -> wordP
@ -218,6 +229,20 @@ markdownP = mconcat <$> A.many' fragmentP
name <- displayNameTextP
let sName = if c == '\'' then '\'' `T.cons` name `T.snoc` '\'' else name
pure $ markdown (Mention name) ('@' `T.cons` sName)
webLinkP = do
t <- A.char '[' *> A.takeWhile1 (/= ']') <* A.char ']'
uri <- A.char '(' *> A.takeWhile1 (/= ')') <* A.char ')'
sowLink <- either fail pure $ parseLinkUri uri
case sowLink of
SOWSimplex _ -> fail "SimpleX links with link text not supported"
SOWWeb _ -> pure ()
let t' = T.dropAround isPunctuation $ T.filter (not . isSpace) t
(linkUri, spoofed) = case either (\_ -> parseLinkUri ("https://" <> t')) Right $ parseLinkUri t' of
Right _
| t == uri -> (Nothing, False)
| otherwise -> (Just uri, True)
Left _ -> (Just uri, False)
pure $ markdown (sowLinkFormat linkUri spoofed sowLink) t
colorP =
A.anyChar >>= \case
'r' -> "ed" $> Red <|> pure Red
@ -251,23 +276,42 @@ markdownP = mconcat <$> A.many' fragmentP
let t = T.takeWhileEnd isPunctuation' s
uri = uriMarkdown $ T.dropWhileEnd isPunctuation' s
in if T.null t then uri else uri :|: unmarked t
| isDomain s = markdown (WebLink "https" (Just $ "https://" <> s) Nothing False) s
| isEmail s = markdown Email s
| otherwise = unmarked s
isPunctuation' = \case
'/' -> False
')' -> False
c -> isPunctuation c
uriMarkdown s = case strDecode $ encodeUtf8 s of
Right cLink -> markdown (simplexUriFormat cLink) s
_ -> markdown Uri s
uriMarkdown s = case parseLinkUri s of
Right sowLink -> markdown (sowLinkFormat Nothing False sowLink) s
Left _ -> unmarked s
isUri s = T.length s >= 10 && any (`T.isPrefixOf` s) ["http://", "https://", "simplex:/"]
isDomain s = case T.splitOn "." s of
[name, tld] -> validDomain name tld
[sub, name, tld] -> T.length sub > 0 && T.length sub <= 8 && validDomain name tld
_ -> False
where
validDomain name tld =
(let n = T.length name in n >= 1 && n <= 24)
&& (let n = T.length tld in n >= 2 && n <= 8)
&& (let p c = isAscii c && isAlpha c in T.all p name && T.all p tld)
isEmail s = T.any (== '@') s && Email.isValid (encodeUtf8 s)
noFormat = pure . unmarked
simplexUriFormat :: AConnectionLink -> Format
simplexUriFormat = \case
sowLinkFormat :: Maybe Text -> Bool -> SimplexOrWebLink -> Format
sowLinkFormat linkUri spoofed = \case
SOWSimplex cLink -> simplexUriFormat spoofed cLink
SOWWeb uri@U.URI {uriScheme = U.Scheme sch, uriQuery = U.Query originalQS} ->
let sanitizedQS = filter (\(p, _) -> p == "q" || p == "search") originalQS
sanitizedUri
| length sanitizedQS == length originalQS = Nothing
| otherwise = Just $ safeDecodeUtf8 $ U.serializeURIRef' uri {U.uriQuery = U.Query sanitizedQS}
in WebLink {scheme = safeDecodeUtf8 sch, linkUri, sanitizedUri, spoofed}
simplexUriFormat :: Bool -> AConnectionLink -> Format
simplexUriFormat spoofed = \case
ACL m (CLFull cReq) -> case cReq of
CRContactUri crData -> SimplexLink (linkType' crData) cLink $ uriHosts crData
CRInvitationUri crData _ -> SimplexLink XLInvitation cLink $ uriHosts crData
CRContactUri crData -> SimplexLink (linkType' crData) cLink (uriHosts crData) spoofed
CRInvitationUri crData _ -> SimplexLink XLInvitation cLink (uriHosts crData) spoofed
where
cLink = ACL m $ CLFull $ simplexConnReqUri cReq
uriHosts ConnReqUriData {crSmpQueues} = L.map strEncodeText $ sconcat $ L.map (host . qServer) crSmpQueues
@ -275,8 +319,8 @@ markdownP = mconcat <$> A.many' fragmentP
Just (CRDataGroup _) -> XLGroup
Nothing -> XLContact
ACL m (CLShort sLnk) -> case sLnk of
CSLContact _ ct srv _ -> SimplexLink (linkType' ct) cLink $ uriHosts srv
CSLInvitation _ srv _ _ -> SimplexLink XLInvitation cLink $ uriHosts srv
CSLContact _ ct srv _ -> SimplexLink (linkType' ct) cLink (uriHosts srv) spoofed
CSLInvitation _ srv _ _ -> SimplexLink XLInvitation cLink (uriHosts srv) spoofed
where
cLink = ACL m $ CLShort $ simplexShortLink sLnk
uriHosts srv = L.map strEncodeText $ host srv
@ -297,8 +341,11 @@ markdownText (FormattedText f_ t) = case f_ of
Snippet -> around '`'
Secret -> around '#'
Colored (FormatColor c) -> color c
Uri -> t
SimplexLink {} -> t
WebLink {linkUri} -> case linkUri of
Just uri | uri /= t && uri /= ("https://" <> t) ->
"[" <> t <> "](" <> uri <> ")"
_ -> t
SimplexLink {simplexUri} -> t
Mention _ -> t
Email -> t
Phone -> t

View file

@ -14,7 +14,6 @@ import Control.Exception (finally)
import Control.Monad (forM_, when)
import qualified Data.Text as T
import Directory.Captcha
import qualified Directory.Events as DE
import Directory.Options
import Directory.Service
import Directory.Store
@ -22,6 +21,7 @@ import GHC.IO.Handle (hClose)
import Simplex.Chat.Bot.KnownContacts
import Simplex.Chat.Controller (ChatConfig (..), ChatHooks (..), defaultChatHooks)
import Simplex.Chat.Core
import qualified Simplex.Chat.Markdown as Markdown
import Simplex.Chat.Options (CoreChatOpts (..))
import Simplex.Chat.Options.DB
import Simplex.Chat.Types (Profile (..))
@ -111,7 +111,7 @@ serviceDbPrefix :: FilePath
serviceDbPrefix = "directory_service"
viewName :: String -> String
viewName = T.unpack . DE.viewName . T.pack
viewName = T.unpack . Markdown.viewName . T.pack
testDirectoryService :: HasCallStack => TestParams -> IO ()
testDirectoryService ps =

View file

@ -20,6 +20,7 @@ markdownTests = do
secretText
textColor
textWithUri
textWithHyperlink
textWithEmail
textWithPhone
textWithMentions
@ -168,23 +169,35 @@ textColor = describe "text color (red)" do
<==> "snippet: " <> markdown Snippet "this is !1 red text!"
uri :: Text -> Markdown
uri = Markdown $ Just Uri
uri = Markdown $ Just baseUriFormat
httpUri :: Text -> Markdown
httpUri = Markdown $ Just baseUriFormat {scheme = "http"}
weblink :: Text -> Text -> Markdown
weblink u = Markdown $ Just baseUriFormat {linkUri = Just u}
sanitizedWebLink :: Text -> Text -> Markdown
sanitizedWebLink u = Markdown $ Just baseUriFormat {sanitizedUri = Just u}
baseUriFormat :: Format
baseUriFormat = WebLink {scheme = "https", linkUri = Nothing, sanitizedUri = Nothing, spoofed = False}
simplexLink :: SimplexLinkType -> Text -> NonEmpty Text -> Text -> Markdown
simplexLink linkType uriText smpHosts t = Markdown (simplexLinkFormat linkType uriText smpHosts) t
simplexLinkFormat :: SimplexLinkType -> Text -> NonEmpty Text -> Maybe Format
simplexLinkFormat linkType uriText smpHosts = case strDecode $ encodeUtf8 uriText of
Right simplexUri -> Just SimplexLink {linkType, simplexUri, smpHosts}
Right simplexUri -> Just SimplexLink {linkType, simplexUri, smpHosts, spoofed = False}
Left e -> error e
textWithUri :: Spec
textWithUri = describe "text with Uri" do
textWithUri = describe "text with WebLink without link text" do
it "correct markdown" do
"https://simplex.chat" <==> uri "https://simplex.chat"
"https://simplex.chat." <==> uri "https://simplex.chat" <> "."
"https://simplex.chat, hello" <==> uri "https://simplex.chat" <> ", hello"
"http://simplex.chat" <==> uri "http://simplex.chat"
"http://simplex.chat" <==> httpUri "http://simplex.chat"
"this is https://simplex.chat" <==> "this is " <> uri "https://simplex.chat"
"https://simplex.chat site" <==> uri "https://simplex.chat" <> " site"
"SimpleX on GitHub: https://github.com/simplex-chat/" <==> "SimpleX on GitHub: " <> uri "https://github.com/simplex-chat/"
@ -192,10 +205,26 @@ textWithUri = describe "text with Uri" do
"https://github.com/simplex-chat/ - SimpleX on GitHub" <==> uri "https://github.com/simplex-chat/" <> " - SimpleX on GitHub"
-- "SimpleX on GitHub (https://github.com/simplex-chat/)" <==> "SimpleX on GitHub (" <> uri "https://github.com/simplex-chat/" <> ")"
"https://en.m.wikipedia.org/wiki/Servo_(software)" <==> uri "https://en.m.wikipedia.org/wiki/Servo_(software)"
"example.com" <==> weblink "https://example.com" "example.com"
"www.example.com" <==> weblink "https://www.example.com" "www.example.com"
"example.academy" <==> weblink "https://example.academy" "example.academy"
"this is example.com" <==> "this is " <> weblink "https://example.com" "example.com"
"x.com" <==> weblink "https://x.com" "x.com"
"https://example.com/?ref=123" <==> sanitizedWebLink "https://example.com/" "https://example.com/?ref=123"
"https://example.com/?ref=123#anchor" <==> sanitizedWebLink "https://example.com/#anchor" "https://example.com/?ref=123#anchor"
"https://example.com/#anchor?ref=123" <==> uri "https://example.com/#anchor?ref=123"
"https://duckduckgo.com/?q=search+string" <==> uri "https://duckduckgo.com/?q=search+string"
"https://duckduckgo.com/?t=h_&q=search+string&ia=web" <==> sanitizedWebLink "https://duckduckgo.com/?q=search%20string" "https://duckduckgo.com/?t=h_&q=search+string&ia=web"
it "ignored as markdown" do
"_https://simplex.chat" <==> "_https://simplex.chat"
"this is _https://simplex.chat" <==> "this is _https://simplex.chat"
"this is https://" <==> "this is https://"
"example.c" <==> "example.c"
"www.www.example.com" <==> "www.www.example.com"
"www.example1.com" <==> "www.example1.com"
"www." <==> "www."
".com" <==> ".com"
"example.academytoolong" <==> "example.academytoolong"
it "SimpleX links" do
let inv = "/invitation#/?v=1&smp=smp%3A%2F%2F1234-w%3D%3D%40smp.simplex.im%3A5223%2F3456-w%3D%3D%23%2F%3Fv%3D1-2%26dh%3DMCowBQYDK2VuAyEAjiswwI3O_NlS8Fk3HJUW870EY2bAwmttMBsvRB9eV3o%253D&e2e=v%3D2%26x3dh%3DMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D%2CMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D"
("https://simplex.chat" <> inv) <==> simplexLink XLInvitation ("simplex:" <> inv) ["smp.simplex.im"] ("https://simplex.chat" <> inv)
@ -208,6 +237,25 @@ textWithUri = describe "text with Uri" do
("https://simplex.chat" <> gr) <==> simplexLink XLGroup ("simplex:" <> gr) ["smp4.simplex.im", "o5vmywmrnaxalvz6wi3zicyftgio6psuvyniis6gco6bp6ekl4cqj4id.onion"] ("https://simplex.chat" <> gr)
("simplex:" <> gr) <==> simplexLink XLGroup ("simplex:" <> gr) ["smp4.simplex.im", "o5vmywmrnaxalvz6wi3zicyftgio6psuvyniis6gco6bp6ekl4cqj4id.onion"] ("simplex:" <> gr)
spoofedLink :: Text -> Text -> Markdown
spoofedLink u = Markdown $ Just baseUriFormat {linkUri = Just u, spoofed = True}
textWithHyperlink :: Spec
textWithHyperlink = describe "text with WebLink without link text" do
it "correct markdown" do
"[click here](https://example.com)" <==> weblink "https://example.com" "click here"
"For details [click here](https://example.com)" <==> "For details " <> weblink "https://example.com" "click here"
it "spoofed link" do
"[https://example.com](https://another.com)" <==> spoofedLink "https://another.com" "https://example.com"
"[example.com/page](https://another.com/page)" <==> spoofedLink "https://another.com/page" "example.com/page"
"[ example.com/page!. ](https://another.com/page)" <==> spoofedLink "https://another.com/page" " example.com/page!. "
"[ example . com/page !. ](https://another.com/page)" <==> spoofedLink "https://another.com/page" " example . com/page !. "
it "ignored as markdown" do
"[click here](example.com)" <==> "[click here](example.com)"
"[click here](https://example.com )" <==> "[click here](https://example.com )"
let inv = "/invitation#/?v=1&smp=smp%3A%2F%2F1234-w%3D%3D%40smp.simplex.im%3A5223%2F3456-w%3D%3D%23%2F%3Fv%3D1-2%26dh%3DMCowBQYDK2VuAyEAjiswwI3O_NlS8Fk3HJUW870EY2bAwmttMBsvRB9eV3o%253D&e2e=v%3D2%26x3dh%3DMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D%2CMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D"
("[Connect to me](https://simplex.chat" <> inv <> ")") <==> ("[Connect to me](https://simplex.chat" <> unmarked inv <> ")")
email :: Text -> Markdown
email = Markdown $ Just Email
@ -221,11 +269,11 @@ textWithEmail = describe "text with Email" do
"chat@simplex.chat test" <==> email "chat@simplex.chat" <> " test"
"test1 chat@simplex.chat test2" <==> "test1 " <> email "chat@simplex.chat" <> " test2"
it "ignored as markdown" do
"chat @simplex.chat" <==> "chat " <> mention "simplex.chat" "@simplex.chat"
"this is chat @simplex.chat" <==> "this is chat " <> mention "simplex.chat" "@simplex.chat"
"this is chat@ simplex.chat" <==> "this is chat@ simplex.chat"
"this is chat @ simplex.chat" <==> "this is chat @ simplex.chat"
"*this* is chat @ simplex.chat" <==> bold "this" <> " is chat @ simplex.chat"
"chat @'simplex.chat'" <==> "chat " <> mention "simplex.chat" "@'simplex.chat'"
"this is chat @'simplex.chat'" <==> "this is chat " <> mention "simplex.chat" "@'simplex.chat'"
"this is chat@ simplex.chat" <==> "this is chat@ " <> weblink "https://simplex.chat" "simplex.chat"
"this is chat @ simplex.chat" <==> "this is chat @ " <> weblink "https://simplex.chat" "simplex.chat"
"*this* is chat @ simplex.chat" <==> bold "this" <> " is chat @ " <> weblink "https://simplex.chat" "simplex.chat"
phone :: Text -> Markdown
phone = Markdown $ Just Phone
@ -268,14 +316,17 @@ textWithMentions = describe "text with mentions" do
"hello @bob @" <==> "hello " <> mention "bob" "@bob" <> " @"
uri' :: Text -> FormattedText
uri' = FormattedText $ Just Uri
uri' = FormattedText $ Just baseUriFormat
httpUri' :: Text -> FormattedText
httpUri' = FormattedText $ Just baseUriFormat {scheme = "http"}
multilineMarkdownList :: Spec
multilineMarkdownList = describe "multiline markdown" do
it "correct markdown" do
"http://simplex.chat\nhttp://app.simplex.chat" <<==>> [uri' "http://simplex.chat", "\n", uri' "http://app.simplex.chat"]
"http://simplex.chat\nhttp://app.simplex.chat" <<==>> [httpUri' "http://simplex.chat", "\n", httpUri' "http://app.simplex.chat"]
it "combines the same formats" do
"http://simplex.chat\ntext 1\ntext 2\nhttp://app.simplex.chat" <<==>> [uri' "http://simplex.chat", "\ntext 1\ntext 2\n", uri' "http://app.simplex.chat"]
"http://simplex.chat\ntext 1\ntext 2\nhttp://app.simplex.chat" <<==>> [httpUri' "http://simplex.chat", "\ntext 1\ntext 2\n", httpUri' "http://app.simplex.chat"]
it "no markdown" do
parseMaybeMarkdownList "not a\nmarkdown" `shouldBe` Nothing
let inv = "/invitation#/?v=1&smp=smp%3A%2F%2F1234-w%3D%3D%40smp.simplex.im%3A5223%2F3456-w%3D%3D%23%2F%3Fv%3D1-2%26dh%3DMCowBQYDK2VuAyEAjiswwI3O_NlS8Fk3HJUW870EY2bAwmttMBsvRB9eV3o%253D&e2e=v%3D2%26x3dh%3DMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D%2CMEIwBQYDK2VvAzkAmKuSYeQ_m0SixPDS8Wq8VBaTS1cW-Lp0n0h4Diu-kUpR-qXx4SDJ32YGEFoGFGSbGPry5Ychr6U%3D"

View file

@ -49,7 +49,7 @@ main = do
describe "Schema dump" schemaDumpTest
around tmpBracket $ describe "WebRTC encryption" webRTCTests
#endif
describe "SimpleX chat markdown" markdownTests
fdescribe "SimpleX chat markdown" markdownTests
describe "JSON Tests" jsonTests
describe "SimpleX chat view" viewTests
describe "SimpleX chat protocol" protocolTests