Commit eacdd3be by Jaime Collado

Scraper testing endpoint working

parent 7b90ae02
Showing with 47 additions and 7 deletions
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from . import schemas
from .ScraperNoticias.globalScraper import GlobalScraper
import schemas
from ScraperNoticias.globalScraper import GlobalScraper
import time
......@@ -75,4 +75,40 @@ async def scrap_url(
status_code=status.HTTP_404_NOT_FOUND,
detail="Comments not found"
)
return {"comments": scraped_comments}
\ No newline at end of file
return {"comments": scraped_comments}
@app.get("/test-scraper", response_model=schemas.ScrapersTested, tags=["Scraper"])
async def test_scraper():
    """Tests whether the scrapers work or not.

    Each supported newspaper is scraped once using a fixed sample article
    URL. A scraper counts as working when it yields at least one comment.
    A scraper that raises is reported as not working, so a single broken
    site cannot hide the status of the remaining ones.

    Returns:
        A dictionary with the single key "scrapers", whose value maps the
        name of each newspaper to a boolean (True if comments were scraped).
    """
    # Local import so this block does not depend on the module's import section.
    import logging

    newspapers = {
        "elpais": "https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html",
        "okdiario": "https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826",
        "elmundo": "https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html",
        "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
        "elconfidencial": "https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/",
        "marca": "https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html",
        "abc": "https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https%3A%2F%2Fwww.abc.es%2F",
        "elespanol": "https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html",
        "ntvespana": "https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/",
        "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
        "elperiodistadigital": "https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/",
        "vozpopuli": "https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html",
        "eldebate": "https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html",
        "alertadigital": "https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/",
    }

    # Every newspaper starts as "not working" and is flipped only on success.
    test_result = dict.fromkeys(newspapers, False)

    for newspaper, url in newspapers.items():
        try:
            scraper = GlobalScraper(url)
            df = scraper.process()
            scraped_comments = df.comments.tolist()
        except Exception:
            # A failing scraper is exactly what this endpoint must report:
            # log the traceback and leave this entry as False instead of
            # turning the whole health check into a 500 response.
            logging.getLogger(__name__).exception(
                "Scraper test failed for %s (%s)", newspaper, url
            )
            continue

        test_result[newspaper] = bool(scraped_comments)

    return {"scrapers": test_result}
\ No newline at end of file
import uvicorn
import argparse
from .api import app
from api import app
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......
......@@ -15,4 +15,8 @@ class InputURL(BaseModel):
class ScrapedComments(BaseModel):
    """Output schema for a scrape request: the comments that were scraped."""

    # One string per scraped comment.
    comments: list[str]
\ No newline at end of file
comments: list[str]
class ScrapersTested(BaseModel):
    """Output schema for the scraper test: one pass/fail flag per scraper."""

    # Scraper name mapped to a boolean test outcome.
    scrapers: dict[str, bool]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment