un peu de refactoring pour les extracteurs

This commit is contained in:
Jean-Marie Favreau 2023-11-26 09:53:01 +01:00
parent 4999b47833
commit 794bed6b74

View File

@ -26,10 +26,20 @@ logger = get_task_logger(__name__)
class Extractor:
name = None
@abstractmethod
def is_known_url(url):
    """Report whether this extractor handles ``url``.

    The base version has no behaviour of its own and returns None;
    concrete extractors are expected to override it.
    """
@abstractmethod
def extract(url):
    """Extraction hook to be overridden by concrete extractors.

    The base version does nothing and returns None.
    NOTE(review): presumably returns the data extracted from ``url`` --
    confirm against the concrete extractors.
    """
@abstractmethod
def clean_url(url):
    """Return a cleaned form of ``url`` for this extractor.

    The base version has no behaviour of its own and returns None;
    concrete extractors are expected to override it.
    """
def download(url):
try:
options = Options()
@ -69,6 +79,8 @@ class Extractor:
class ExtractorFacebook(Extractor):
name = "Facebook"
class SimpleFacebookEvent:
def __init__(self, data):
@ -194,8 +206,7 @@ class ExtractorFacebook(Extractor):
image = self.get_element("image")
local_image = None if image is None else Extractor.download_media(image)
u = urlparse(url)
unique_url = u.scheme + "://" + u.netloc + u.path
return Event(title=self.get_element("name"),
status=Event.STATUS.DRAFT,
@ -208,7 +219,20 @@ class ExtractorFacebook(Extractor):
local_image=local_image,
image=self.get_element("image"),
image_alt=self.get_element("image_alt"),
reference_urls=[unique_url])
reference_urls=[url])
def clean_url(url):
    """Reduce a Facebook URL to its scheme, host and path.

    URLs that ``ExtractorFacebook.is_known_url`` does not recognise are
    returned unchanged.
    """
    if not ExtractorFacebook.is_known_url(url):
        return url
    parts = urlparse(url)
    # Everything after the path (query string, fragment) is dropped.
    return "".join((parts.scheme, "://", parts.netloc, parts.path))
def is_known_url(url):
    """Tell whether ``url`` points at a Facebook host.

    The comparison uses the parsed host name rather than the raw
    ``netloc`` so that letter case, an explicit port or userinfo in the
    URL do not prevent a match.

    Args:
        url: URL string to inspect.

    Returns:
        True when the host is ``facebook.com``, ``www.facebook.com`` or
        ``m.facebook.com``; False otherwise, including when no host can
        be parsed from ``url``.
    """
    # `hostname` is lower-cased and excludes userinfo/port, unlike `netloc`.
    host = urlparse(url).hostname
    return host in ("facebook.com", "www.facebook.com", "m.facebook.com")
def process_page(txt, url):
@ -230,6 +254,13 @@ class ExtractorFacebook(Extractor):
class ExtractorAllURLs:
extractors = [ExtractorFacebook]
def clean_url(url):
    """Run ``url`` through the ``clean_url`` of every registered extractor.

    Extractors are applied in the order of ``ExtractorAllURLs.extractors``;
    each one receives the output of the previous one.
    """
    cleaned = url
    for extractor in ExtractorAllURLs.extractors:
        cleaned = extractor.clean_url(cleaned)
    return cleaned
def extract(url):
logger.info("Run extraction")
@ -239,13 +270,12 @@ class ExtractorAllURLs:
logger.info("Cannot download url")
return None
result = ExtractorFacebook.process_page(txt, url)
for e in ExtractorAllURLs.extractors:
result = e.process_page(txt, url)
if result is not None:
return result
else:
logger.info("Not a Facebook link")
# TODO: add other extractors here
if result is not None:
return result
else:
logger.info("Not a " + e.name + " link")
return None