un peu de refactoring pour les extracteurs
This commit is contained in:
parent
4999b47833
commit
794bed6b74
@ -26,10 +26,20 @@ logger = get_task_logger(__name__)
|
||||
|
||||
class Extractor:
|
||||
|
||||
name = None
|
||||
|
||||
@abstractmethod
def is_known_url(url):
    """Report whether this extractor handles ``url``.

    Placeholder meant to be overridden by concrete extractors; this base
    version does nothing and returns ``None``.

    NOTE(review): the visible ``class Extractor:`` header has no ABC base,
    so this decorator is presumably not enforced at instantiation -- confirm.
    """
|
||||
|
||||
@abstractmethod
def extract(url):
    """Extract data from ``url``.

    Placeholder meant to be overridden by concrete extractors; this base
    version does nothing and returns ``None``.

    NOTE(review): what an override should return is not visible here --
    presumably an event object or ``None``; verify against the subclasses.
    """
|
||||
|
||||
@abstractmethod
def clean_url(url):
    """Return a normalized form of ``url`` for this extractor.

    Placeholder meant to be overridden by concrete extractors; this base
    version does nothing and returns ``None``.
    """
|
||||
|
||||
def download(url):
|
||||
try:
|
||||
options = Options()
|
||||
@ -69,6 +79,8 @@ class Extractor:
|
||||
|
||||
class ExtractorFacebook(Extractor):
|
||||
|
||||
name = "Facebook"
|
||||
|
||||
class SimpleFacebookEvent:
|
||||
|
||||
def __init__(self, data):
|
||||
@ -194,8 +206,7 @@ class ExtractorFacebook(Extractor):
|
||||
image = self.get_element("image")
|
||||
local_image = None if image is None else Extractor.download_media(image)
|
||||
|
||||
u = urlparse(url)
|
||||
unique_url = u.scheme + "://" + u.netloc + u.path
|
||||
|
||||
|
||||
return Event(title=self.get_element("name"),
|
||||
status=Event.STATUS.DRAFT,
|
||||
@ -208,7 +219,20 @@ class ExtractorFacebook(Extractor):
|
||||
local_image=local_image,
|
||||
image=self.get_element("image"),
|
||||
image_alt=self.get_element("image_alt"),
|
||||
reference_urls=[unique_url])
|
||||
reference_urls=[url])
|
||||
|
||||
|
||||
def clean_url(url):
    """Strip everything after the path from a Facebook URL.

    URLs that ``ExtractorFacebook.is_known_url`` does not recognize are
    returned untouched. Recognized URLs are rebuilt from scheme, network
    location and path only, so params, query string and fragment are dropped.
    """
    # Guard clause: leave URLs from other sites alone.
    if not ExtractorFacebook.is_known_url(url):
        return url

    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}{parts.path}"
|
||||
|
||||
def is_known_url(url):
    """Tell whether ``url`` points at one of the Facebook hosts.

    The comparison uses the parsed *hostname* rather than the raw network
    location: hostnames are case-insensitive and the network location may
    also carry a port or credentials, none of which should stop a Facebook
    URL from being recognized.

    Returns ``True`` for ``facebook.com``, ``www.facebook.com`` and
    ``m.facebook.com``; ``False`` otherwise, including when the URL has
    no host at all.
    """
    # ``hostname`` is lower-cased and stripped of port/userinfo; it is None
    # when the URL has no network location, which simply fails the test.
    host = urlparse(url).hostname
    return host in ("facebook.com", "www.facebook.com", "m.facebook.com")
|
||||
|
||||
|
||||
def process_page(txt, url):
|
||||
@ -230,6 +254,13 @@ class ExtractorFacebook(Extractor):
|
||||
|
||||
class ExtractorAllURLs:
|
||||
|
||||
extractors = [ExtractorFacebook]
|
||||
|
||||
def clean_url(url):
    """Run ``url`` through the ``clean_url`` of every registered extractor.

    Extractors in ``ExtractorAllURLs.extractors`` are applied in list order,
    each one receiving the output of the previous one; the final value is
    returned.
    """
    for extractor in ExtractorAllURLs.extractors:
        # Chain the cleaners: the next one sees the already-cleaned URL.
        url = extractor.clean_url(url)
    return url
|
||||
|
||||
def extract(url):
|
||||
logger.info("Run extraction")
|
||||
@ -239,13 +270,12 @@ class ExtractorAllURLs:
|
||||
logger.info("Cannot download url")
|
||||
return None
|
||||
|
||||
result = ExtractorFacebook.process_page(txt, url)
|
||||
for e in ExtractorAllURLs.extractors:
|
||||
result = e.process_page(txt, url)
|
||||
|
||||
if result is not None:
|
||||
return result
|
||||
else:
|
||||
logger.info("Not a Facebook link")
|
||||
|
||||
# TODO: add other extractors here
|
||||
if result is not None:
|
||||
return result
|
||||
else:
|
||||
logger.info("Not a " + e.name + " link")
|
||||
|
||||
return None
|
||||
|
Loading…
x
Reference in New Issue
Block a user