Reaktor url-title: fix some issues with weird urls
ref: https://irc-bot-science.clsr.net/
This commit is contained in:
parent
70e8c4b0a4
commit
92aa5bb232
@ -121,21 +121,27 @@ rec {
|
|||||||
pattern = "^.*(?P<args>http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+).*$$";
|
pattern = "^.*(?P<args>http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+).*$$";
|
||||||
path = with pkgs; [ curl perl ];
|
path = with pkgs; [ curl perl ];
|
||||||
script = pkgs.writePython3 "url-title" [ "beautifulsoup4" "lxml" ] ''
|
script = pkgs.writePython3 "url-title" [ "beautifulsoup4" "lxml" ] ''
|
||||||
|
import cgi
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(urllib.request.urlopen(sys.argv[1]), "lxml")
|
resp = urllib.request.urlopen(sys.argv[1])
|
||||||
|
if resp.headers['content-type'].find('text/html') >= 0:
|
||||||
|
soup = BeautifulSoup(resp.read(16000), "lxml")
|
||||||
title = soup.find('title').string
|
title = soup.find('title').string
|
||||||
|
|
||||||
if title:
|
if title:
|
||||||
if len(title) > 512:
|
if len(title) > 450:
|
||||||
print('message to long, skipped')
|
print('message to long, rest skipped')
|
||||||
elif len(title.split('\n')) > 5:
|
elif len(title.split('\n')) > 5:
|
||||||
print('to many lines, skipped')
|
print('to many lines, skipped')
|
||||||
else:
|
else:
|
||||||
print(title)
|
print(title)
|
||||||
|
else:
|
||||||
|
cd_header = resp.headers['content-disposition']
|
||||||
|
print(cgi.parse_header(cd_header)[1]['filename'])
|
||||||
except: # noqa: E722
|
except: # noqa: E722
|
||||||
pass
|
pass
|
||||||
'';
|
'';
|
||||||
|
Loading…
Reference in New Issue
Block a user