Fixed bug in parallelized endpoint

7dadb6b5 · Jaime Collado · 085369f4 · 26feabea · 7dadb6b5
Commit 7dadb6b5 authored Nov 09, 2023 by Jaime Collado
Showing 2 changed files with 53 additions and 24 deletions
app/ScraperNoticias
app/api.py
--- a/ScraperNoticias @ 26feabea
+++ b/ScraperNoticias @ 26feabea
-Subproject commit 95c75c27f876ccc49da32a0fceddfcaee625ea46
+Subproject commit 26feabea82aef9bf695c168ccc9da90e0ade023e
--- a/app/api.py
+++ b/app/api.py
-from multiprocessing import Pool
+import time
+
+import multiprocessing as mp

 from fastapi import FastAPI, HTTPException, status
 from fastapi.middleware.cors import CORSMiddleware
@@ -6,9 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
 import schemas
 from ScraperNoticias.globalScraper import GlobalScraper

-import time
-
-
 # APP
 app = FastAPI()
 #app = FastAPI(openapi_url=None) # Disable interactive docs
@@ -25,15 +24,14 @@ app.add_middleware(

 # ---------- UTILS ----------
 def test_scraper(url):
-    scraped_comments = scrap(url)
-    if scraped_comments:
-        return True
-    return False
+    return bool(scrap(url))  # True only when scrap() returned a non-empty list of comments

 def scrap(url):
+    print(f"Process {mp.current_process().name} started scraping {url}")  # trace: name of the process that picked up this URL
    scraper = GlobalScraper(url)
    df = scraper.process()
    scraped_comments = df.comments.tolist()
+    print(f"Process {mp.current_process().name} ended scraping {url}")  # trace: same process finished this URL
    return scraped_comments


@@ -79,10 +77,6 @@ async def scrap_url(
        
    Returns:
        A list containing the scraped comments."""
-    
-    print("Antes del sleep")
-    time.sleep(10)
-    print("Después del sleep")

    scraped_comments = scrap(url.url)
    if not scraped_comments:
@@ -92,42 +86,77 @@ async def scrap_url(
        )
    return {"comments": scraped_comments}

-@app.get("/test-scraper", response_model=schemas.ScrapersTested, tags=["Scraper"])
+@app.get("/test-scrapers", response_model=schemas.ScrapersTested, tags=["Scraper"])
 async def test_scrapers():
    """Tests whether the scrapers work or not.

    Returns:
        A dictionary with the name of the newspaper as key and a boolean as value.
    """
+    start = time.time()

    newspapers = {
        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
-        "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
+        # "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
-        "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
+        # "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
    }

-    test_result = {k: False for k, _ in newspapers.items()}
+    ctx = mp.get_context("spawn") # "spawn" keeps FastAPI from shutting down when the workers end (https://github.com/tiangolo/fastapi/issues/1487); a local context is used because mp.set_start_method() raises RuntimeError when called again on the next request
+    with ctx.Pool(processes=8) as pool:
+        test_results = pool.map(test_scraper, newspapers.values())
+       
+    scrapers_test = dict(zip(newspapers, test_results))  # pool.map preserves input order, so results line up with the dict keys
+
+    end = time.time()
+    print("Tiempo de ejecución:", end - start)
+
+    return {"scrapers": scrapers_test}

-    pool = Pool(processes=14)
+
+@app.get("/test-scrapers-deprecated", response_model=schemas.ScrapersTested, tags=["Scraper"])
+async def test_scrapers_deprecated():
+    """Tests the scrapers one URL at a time, without a process pool.
+
+    Returns:
+        A dictionary whose "scrapers" entry maps each newspaper name to a boolean.
+    """
+    start = time.time()
+
+    newspapers = {
+        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
+        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
+        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
+        # "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
+        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
+        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
+        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
+        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
+        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
+        # "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
+        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
+        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
+        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
+        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
+    }
+
+    test_result = {k: False for k, _ in newspapers.items()}
    
    for newspaper, url in newspapers.items():
-        test_result[newspaper] = pool.apply_async(test_scraper, [url])        
-        
-    pool.close()
-    pool.join()
+        test_result[newspaper] = test_scraper(url)        

-    final = {k: v.get() for k, v in test_result.items()}
+    end = time.time()
+    print("Tiempo de ejecución:", end - start)

-    return {"scrapers": final}
+    return {"scrapers": test_result}