Commit f581edb0 by Jaime Collado

Scraper method included

parent 15168362
......@@ -51,3 +51,6 @@ docs/_build/
# Environments
venv/
# Scraper
ScraperNoticias/
from fastapi import FastAPI
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
import schemas
from ScraperNoticias.globalScraper import GlobalScraper
# APP -- single FastAPI application instance shared by every route below.
app = FastAPI()
# Alternative: hide the auto-generated OpenAPI schema (and the /docs, /redoc
# pages) in production by disabling the openapi_url.
#app = FastAPI(openapi_url=None) # Disable interactive docs
......@@ -46,3 +48,24 @@ async def predict_toxicity(
# TODO: Aquí invocamos al modelo de toxicidad y devolvemos la predicción.
return {"prediction": 0.0}
@app.post("/scrap", response_model=schemas.ScrapedComments, tags=["Scraper"])
async def scrap_url(
    url: schemas.InputURL
):
    """Scraps comments from a given URL.

    Runs the news scraper against the submitted URL and collects the
    comments found on the page.

    Args:
        url: URL containing news.

    Returns:
        A list containing the scraped comments.

    Raises:
        HTTPException: 404 when the page yields no comments.
    """
    # Run the scraper pipeline and pull the comments column as a plain list.
    comments = GlobalScraper(url.url).process().comments.tolist()
    if not comments:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Comments not found"
        )
    return {"comments": comments}
\ No newline at end of file
anyio==3.6.2
async-generator==1.10
attrs==22.2.0
bcrypt==4.0.1
beautifulsoup4==4.11.2
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
cryptography==39.0.1
cssselect==1.2.0
dnspython==2.3.0
docutils==0.20.1
ecdsa==0.18.0
email-validator==1.3.1
exceptiongroup==1.1.1
fastapi==0.89.1
feedfinder2==0.0.4
feedparser==6.0.10
filelock==3.12.0
greenlet==2.0.2
h11==0.14.0
httpcore==0.16.3
......@@ -15,32 +25,62 @@ httptools==0.5.0
httpx==0.23.3
idna==3.4
itsdangerous==2.1.2
jieba3k==0.35.1
Jinja2==3.1.2
joblib==1.2.0
lockfile==0.12.2
lxml==4.9.2
MarkupSafe==2.1.2
newspaper3k==0.2.8
nltk==3.8.1
numpy==1.24.2
orjson==3.8.5
outcome==1.2.0
pandas==2.0.2
passlib==1.7.4
pid==3.0.4
Pillow==9.5.0
pyasn1==0.4.8
pycparser==2.21
pydantic==1.10.4
PySocks==1.7.1
python-daemon==3.0.1
python-dateutil==2.8.2
python-dotenv==0.21.1
python-jose==3.3.0
python-multipart==0.0.5
pytz==2023.3
PyYAML==6.0
regex==2023.5.5
requests==2.28.2
requests-file==1.5.1
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.0.2
scipy==1.10.0
selenium==4.8.2
service==0.6.0
setproctitle==1.3.2
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4
SQLAlchemy==2.0.2
starlette==0.22.0
threadpoolctl==3.1.0
tinysegmenter==0.3
tldextract==3.4.4
tqdm==4.65.0
trio==0.22.0
trio-websocket==0.10.2
typing_extensions==4.4.0
tzdata==2023.3
ujson==5.7.0
urllib3==1.26.15
uvicorn==0.20.0
uvloop==0.17.0
watchfiles==0.18.1
websockets==10.4
wsproto==1.2.0
xgboost==0.90
from pydantic import BaseModel
from pydantic import BaseModel, HttpUrl
# ---------- DATA SCHEMAS ----------
class InputText(BaseModel):
    """Schema to define the input structure of the data to predict."""
    # Raw text whose toxicity will be predicted.
    text: str
class OutputPrediction(BaseModel):
    """Schema to define the output predictions' structure."""
    # Toxicity score returned by the model.
    prediction: float
class InputURL(BaseModel):
    """Schema to define the input structure of a URL to scrape."""
    # Validated URL of the news page; HttpUrl rejects malformed values.
    url: HttpUrl
class ScrapedComments(BaseModel):
    """Schema to define the output structure of the scraped comments."""
    # Comment texts extracted from the scraped news page.
    comments: list[str]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment