Commit 7dadb6b5 by Jaime Collado

Fixed bug in parallelized endpoint

parent 085369f4
Showing with 53 additions and 24 deletions
ScraperNoticias @ 26feabea
Subproject commit 95c75c27f876ccc49da32a0fceddfcaee625ea46
Subproject commit 26feabea82aef9bf695c168ccc9da90e0ade023e
from multiprocessing import Pool
import time
import multiprocessing as mp
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
......@@ -6,9 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
import schemas
from ScraperNoticias.globalScraper import GlobalScraper
import time
# APP
app = FastAPI()
#app = FastAPI(openapi_url=None) # Disable interactive docs
......@@ -25,15 +24,14 @@ app.add_middleware(
# ---------- UTILS ----------
def test_scraper(url):
    """Report whether scraping ``url`` yields any comments.

    Args:
        url: Address of the news article, passed straight to ``scrap``.

    Returns:
        True when ``scrap(url)`` returns a truthy (non-empty) result,
        False otherwise.
    """
    # A single truthiness check replaces the duplicated old/new bodies, which
    # left a second, unreachable ``return`` behind.
    return bool(scrap(url))
def scrap(url):
    """Scrape the comments of the article found at ``url``.

    Builds a ``GlobalScraper`` for the URL, runs its ``process()`` step and
    turns the ``comments`` attribute of the result into a list. A start and an
    end message, both tagged with the name of the current process, are printed
    so the work can be followed when it runs inside a worker pool.

    Args:
        url: Address of the news article to scrape.

    Returns:
        The list produced by ``.comments.tolist()`` on the processed result.
    """
    worker_name = mp.current_process().name
    print(f"Process {worker_name} started scraping {url}")
    comments = GlobalScraper(url).process().comments.tolist()
    print(f"Process {worker_name} ended scraping {url}")
    return comments
......@@ -79,10 +77,6 @@ async def scrap_url(
Returns:
A list containing the scraped comments."""
print("Antes del sleep")
time.sleep(10)
print("Después del sleep")
scraped_comments = scrap(url.url)
if not scraped_comments:
......@@ -92,42 +86,77 @@ async def scrap_url(
)
return {"comments": scraped_comments}
@app.get("/test-scraper", response_model=schemas.ScrapersTested, tags=["Scraper"])
@app.get("/test-scrapers", response_model=schemas.ScrapersTested, tags=["Scraper"])
async def test_scrapers():
    """Tests whether the scrapers work or not, running them in parallel.

    Every sample URL is handed to ``test_scraper`` through a pool of 8 worker
    processes, and the elapsed wall-clock time is printed at the end.

    Returns:
        A dictionary whose "scrapers" entry maps the name of each newspaper
        to a boolean telling whether its scraper returned any comments.
    """
    start = time.time()

    newspapers = {
        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
        # "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
        # "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
    }

    # Use a local "spawn" context instead of mp.set_start_method(): the global
    # start method can only be set once per process, so setting it on every
    # request raises RuntimeError from the second call onwards.
    # "spawn" itself is kept as the fix for the FastAPI process shutting down
    # when all worker processes end. Seen here:
    # https://github.com/tiangolo/fastapi/issues/1487
    spawn_context = mp.get_context("spawn")

    # NOTE: pool.map blocks until every URL has been processed; results come
    # back in the same order as newspapers.values().
    with spawn_context.Pool(processes=8) as pool:
        test_results = pool.map(test_scraper, newspapers.values())

    scrapers_test = dict(zip(newspapers, test_results))

    end = time.time()
    print("Tiempo de ejecución:", end - start)

    return {"scrapers": scrapers_test}
pool = Pool(processes=14)
@app.get("/test-scrapers-deprecated", response_model=schemas.ScrapersTested, tags=["Scraper"])
async def test_scrapers_deprecated():
    """Tests whether the scrapers work or not, one newspaper after another.

    Sequential counterpart of the parallel test endpoint: each sample URL is
    passed to ``test_scraper`` in turn, and the elapsed wall-clock time is
    printed at the end.

    Returns:
        A dictionary whose "scrapers" entry maps the name of each newspaper
        to a boolean telling whether its scraper returned any comments.
    """
    start = time.time()

    newspapers = {
        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
        # "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
        # "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
    }

    # Run the scrapers in-process, in dictionary order. The leftover calls on
    # the shared module-level pool (apply_async / close / join / get) are
    # gone: closing that pool would break every request after the first one.
    test_result = {
        newspaper: test_scraper(url) for newspaper, url in newspapers.items()
    }

    end = time.time()
    print("Tiempo de ejecución:", end - start)

    return {"scrapers": test_result}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment