Commit 7dadb6b5 by Jaime Collado

Fixed bug in parallelized endpoint

parent 085369f4
Showing with 53 additions and 24 deletions
ScraperNoticias @ 26feabea
Subproject commit 95c75c27f876ccc49da32a0fceddfcaee625ea46 → Subproject commit 26feabea82aef9bf695c168ccc9da90e0ade023e
from multiprocessing import Pool import time
import multiprocessing as mp
from fastapi import FastAPI, HTTPException, status from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
...@@ -6,9 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware ...@@ -6,9 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
import schemas import schemas
from ScraperNoticias.globalScraper import GlobalScraper from ScraperNoticias.globalScraper import GlobalScraper
import time
# APP # APP
app = FastAPI() app = FastAPI()
#app = FastAPI(openapi_url=None) # Disable interactive docs #app = FastAPI(openapi_url=None) # Disable interactive docs
...@@ -25,15 +24,14 @@ app.add_middleware( ...@@ -25,15 +24,14 @@ app.add_middleware(
# ---------- UTILS ---------- # ---------- UTILS ----------
def test_scraper(url): def test_scraper(url):
scraped_comments = scrap(url) return True if scrap(url) else False
if scraped_comments:
return True
return False
def scrap(url): def scrap(url):
print(f"Process {mp.current_process().name} started scraping {url}")
scraper = GlobalScraper(url) scraper = GlobalScraper(url)
df = scraper.process() df = scraper.process()
scraped_comments = df.comments.tolist() scraped_comments = df.comments.tolist()
print(f"Process {mp.current_process().name} ended scraping {url}")
return scraped_comments return scraped_comments
...@@ -79,10 +77,6 @@ async def scrap_url( ...@@ -79,10 +77,6 @@ async def scrap_url(
Returns: Returns:
A list containing the scraped comments.""" A list containing the scraped comments."""
print("Antes del sleep")
time.sleep(10)
print("Después del sleep")
scraped_comments = scrap(url.url) scraped_comments = scrap(url.url)
if not scraped_comments: if not scraped_comments:
...@@ -92,42 +86,77 @@ async def scrap_url( ...@@ -92,42 +86,77 @@ async def scrap_url(
) )
return {"comments": scraped_comments} return {"comments": scraped_comments}
@app.get("/test-scraper", response_model=schemas.ScrapersTested, tags=["Scraper"]) @app.get("/test-scrapers", response_model=schemas.ScrapersTested, tags=["Scraper"])
async def test_scrapers(): async def test_scrapers():
"""Tests whether the scrapers work or not. """Tests whether the scrapers work or not.
Returns: Returns:
A dictionary with the name of the newspaper as key and a boolean as value. A dictionary with the name of the newspaper as key and a boolean as value.
""" """
start = time.time()
newspapers = { newspapers = {
"elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html", "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
"okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826", "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
"elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html", "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
"20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/", # "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
"elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/", "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
"marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html", "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
"abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F", "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
"elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html", "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
"ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/", "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
"theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/", # "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
"elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/", "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
"vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html", "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
"eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html", "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
"alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/" "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
} }
test_result = {k: False for k, _ in newspapers.items()} mp.set_start_method("spawn") # Fix for FastAPI process shutting down when all processes end. Seen here: https://github.com/tiangolo/fastapi/issues/1487
with mp.Pool(processes=8) as pool:
test_results = pool.map(test_scraper, newspapers.values())
scrapers_test = {newspaper: test_result for newspaper, test_result in zip(newspapers.keys(), test_results)}
end = time.time()
print("Tiempo de ejecución:", end - start)
return {"scrapers": scrapers_test}
pool = Pool(processes=14)
@app.get("/test-scrapers-deprecated", response_model=schemas.ScrapersTested, tags=["Scraper"])
async def test_scrapers_deprecated():
"""Tests whether the scrapers work or not.
Returns:
A dictionary with the name of the newspaper as key and a boolean as value.
"""
start = time.time()
newspapers = {
"elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
"okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
"elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
# "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
"elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
"marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
"abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
"elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
"ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
# "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
"elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
"vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
"eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
"alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
}
test_result = {k: False for k, _ in newspapers.items()}
for newspaper, url in newspapers.items(): for newspaper, url in newspapers.items():
test_result[newspaper] = pool.apply_async(test_scraper, [url]) test_result[newspaper] = test_scraper(url)
pool.close()
pool.join()
final = {k: v.get() for k, v in test_result.items()} end = time.time()
print("Tiempo de ejecución:", end - start)
return {"scrapers": final} return {"scrapers": test_result}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment