Scraper testing endpoint working

eacdd3be · Jaime Collado · 7b90ae02 · eacdd3be · eacdd3be · eacdd3be
Commit eacdd3be authored Nov 08, 2023 by Jaime Collado
Showing with 47 additions and 7 deletions
app/api.py
app/main.py
app/schemas.py
--- a/app/api.py
+++ b/app/api.py
 from fastapi import FastAPI, HTTPException, status
 from fastapi.middleware.cors import CORSMiddleware

-from . import schemas
-from .ScraperNoticias.globalScraper import GlobalScraper
+import schemas
+from ScraperNoticias.globalScraper import GlobalScraper

 import time

@@ -75,4 +75,40 @@ async def scrap_url(
            status_code=status.HTTP_404_NOT_FOUND, 
            detail="Comments not found"
        )
-    return {"comments": scraped_comments}
\ No newline at end of file
+    return {"comments": scraped_comments}
+
+@app.get("/test-scraper", response_model=schemas.ScrapersTested, tags=["Scraper"])
+async def test_scraper():
+    """Tests whether the scrapers work or not.
+
+    Each supported newspaper is probed with one hard-coded article URL. A
+    scraper is reported as working only when the `comments` column of its
+    result is non-empty. A scraper that raises is reported as not working
+    instead of aborting the whole request.
+
+    Returns:
+        A dictionary whose "scrapers" key maps the name of each newspaper
+        to a boolean.
+    """
+    # Function-scope import keeps this block self-contained.
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    newspapers = {
+        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
+        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
+        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
+        "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
+        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
+        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
+        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
+        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
+        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
+        "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
+        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
+        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
+        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
+        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
+    }
+
+    # NOTE(review): process() is called without await inside an async handler;
+    # if it performs blocking I/O it stalls the event loop - TODO confirm.
+    test_result = {}
+    for newspaper, url in newspapers.items():
+        try:
+            df = GlobalScraper(url).process()
+            test_result[newspaper] = bool(df.comments.tolist())
+        except Exception:
+            # Broad on purpose: a failure of any kind means "this scraper is
+            # broken", which is exactly what the endpoint reports. Log it
+            # and keep probing the remaining newspapers.
+            logger.exception("Scraper test failed for %s", newspaper)
+            test_result[newspaper] = False
+
+    return {"scrapers": test_result}
\ No newline at end of file
--- a/app/main.py
+++ b/app/main.py
 import uvicorn
 import argparse
-from .api import app
+from api import app

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()

--- a/app/schemas.py
+++ b/app/schemas.py
@@ -15,4 +15,8 @@ class InputURL(BaseModel):

 class ScrapedComments(BaseModel):
    """Schema to define the output structure of the scraped comments."""
-    comments: list[str]
\ No newline at end of file
+    comments: list[str]
+
+class ScrapersTested(BaseModel):
+    """Schema to define the output structure of the tested scrapers."""
+    # Maps a newspaper name to whether its scraper returned any comments.
+    scrapers: dict[str, bool]
\ No newline at end of file