Commit f581edb0 by Jaime Collado

Scraper method included

parent 15168362
@@ -51,3 +51,6 @@ docs/_build/
# Environments
venv/
# Scraper
ScraperNoticias/
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
import schemas
from ScraperNoticias.globalScraper import GlobalScraper
# APP
app = FastAPI()
#app = FastAPI(openapi_url=None) # Disable interactive docs
@@ -46,3 +48,24 @@ async def predict_toxicity(
    # TODO: Invoke the toxicity model here and return the prediction.
    return {"prediction": 0.0}
@app.post("/scrap", response_model=schemas.ScrapedComments, tags=["Scraper"])
async def scrap_url(
url: schemas.InputURL
):
"""Scraps comments from a given URL.
Args:
url: URL containing news.
Returns:
A list containing the scraped comments."""
scraper = GlobalScraper(url.url)
df = scraper.process()
scraped_comments = df.comments.tolist()
if not scraped_comments:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Comments not found"
)
return {"comments": scraped_comments}
\ No newline at end of file
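For reference, a minimal client sketch for the new /scrap endpoint (not part of the commit): it assumes the API is served locally on uvicorn's default port, and both the base address and the example news URL below are placeholders.

import httpx

# POST a JSON body matching schemas.InputURL; the field must be a valid URL.
payload = {"url": "https://example.com/some-news-article"}
response = httpx.post("http://localhost:8000/scrap", json=payload)

if response.status_code == 200:
    # Successful responses match schemas.ScrapedComments: {"comments": [...]}
    for comment in response.json()["comments"]:
        print(comment)
else:
    # 404 means the scraper found no comments; 422 means the URL failed validation.
    print(response.status_code, response.json()["detail"])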
anyio==3.6.2
async-generator==1.10
attrs==22.2.0
bcrypt==4.0.1
beautifulsoup4==4.11.2
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
cryptography==39.0.1
cssselect==1.2.0
dnspython==2.3.0
docutils==0.20.1
ecdsa==0.18.0
email-validator==1.3.1
exceptiongroup==1.1.1
fastapi==0.89.1
feedfinder2==0.0.4
feedparser==6.0.10
filelock==3.12.0
greenlet==2.0.2
h11==0.14.0
httpcore==0.16.3
@@ -15,32 +25,62 @@ httptools==0.5.0
httpx==0.23.3
idna==3.4
itsdangerous==2.1.2
jieba3k==0.35.1
Jinja2==3.1.2
joblib==1.2.0
lockfile==0.12.2
lxml==4.9.2
MarkupSafe==2.1.2
newspaper3k==0.2.8
nltk==3.8.1
numpy==1.24.2
orjson==3.8.5
outcome==1.2.0
pandas==2.0.2
passlib==1.7.4
pid==3.0.4
Pillow==9.5.0
pyasn1==0.4.8
pycparser==2.21
pydantic==1.10.4
PySocks==1.7.1
python-daemon==3.0.1
python-dateutil==2.8.2
python-dotenv==0.21.1
python-jose==3.3.0
python-multipart==0.0.5
pytz==2023.3
PyYAML==6.0
regex==2023.5.5
requests==2.28.2
requests-file==1.5.1
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.0.2
scipy==1.10.0
selenium==4.8.2
service==0.6.0
setproctitle==1.3.2
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4
SQLAlchemy==2.0.2
starlette==0.22.0
threadpoolctl==3.1.0
tinysegmenter==0.3
tldextract==3.4.4
tqdm==4.65.0
trio==0.22.0
trio-websocket==0.10.2
typing_extensions==4.4.0
tzdata==2023.3
ujson==5.7.0
urllib3==1.26.15
uvicorn==0.20.0
uvloop==0.17.0
watchfiles==0.18.1
websockets==10.4
wsproto==1.2.0
xgboost==0.90
from pydantic import BaseModel, HttpUrl
# ---------- DATA SCHEMAS ----------
class InputText(BaseModel):
    """Schema to define the input structure of the data to predict."""
    text: str
class OutputPrediction(BaseModel):
    """Schema to define the output predictions' structure."""
    prediction: float
class InputURL(BaseModel):
    """Schema to define the input URL to scrape."""
    url: HttpUrl
class ScrapedComments(BaseModel):
    """Schema to define the output structure of the scraped comments."""
    comments: list[str]
\ No newline at end of file
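As a quick illustration (not part of the commit) of how the new schemas behave under pydantic 1.10: HttpUrl rejects strings that are not well-formed URLs, so malformed input to /scrap is turned into a 422 response before the scraper runs.

from pydantic import ValidationError
import schemas

# A well-formed URL validates and is kept as an HttpUrl value.
ok = schemas.InputURL(url="https://example.com/news/article")
print(ok.url)

# A malformed URL raises ValidationError, which FastAPI converts into a 422 reply.
try:
    schemas.InputURL(url="not-a-url")
except ValidationError as err:
    print(err)

# The response schema simply wraps a list of strings.
result = schemas.ScrapedComments(comments=["first comment", "second comment"])
print(result.comments)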