Commit f581edb0 by Jaime Collado

Scraper method included

parent 15168362
......@@ -51,3 +51,6 @@ docs/_build/
# Environments
venv/
# Scraper
ScraperNoticias/
from fastapi import FastAPI
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
import schemas
from ScraperNoticias.globalScraper import GlobalScraper
# APP -- single FastAPI application instance shared by every route below.
app = FastAPI()
# Alternative: hide the auto-generated OpenAPI schema (and the /docs, /redoc
# pages) in production by disabling the openapi_url.
#app = FastAPI(openapi_url=None) # Disable interactive docs
......@@ -46,3 +48,24 @@ async def predict_toxicity(
# TODO: Aquí invocamos al modelo de toxicidad y devolvemos la predicción.
return {"prediction": 0.0}
@app.post("/scrap", response_model=schemas.ScrapedComments, tags=["Scraper"])
async def scrap_url(
    url: schemas.InputURL
):
    """Scraps comments from a given URL.

    Runs the news scraper against the submitted URL and collects the
    comments found on the page.

    Args:
        url: URL containing news.

    Returns:
        A list containing the scraped comments.

    Raises:
        HTTPException: 404 when the page yields no comments.
    """
    # Run the scraper pipeline and pull the comments column as a plain list.
    comments = GlobalScraper(url.url).process().comments.tolist()
    if not comments:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Comments not found"
        )
    return {"comments": comments}
\ No newline at end of file
anyio==3.6.2
async-generator==1.10
attrs==22.2.0
bcrypt==4.0.1
beautifulsoup4==4.11.2
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
cryptography==39.0.1
cssselect==1.2.0
dnspython==2.3.0
docutils==0.20.1
ecdsa==0.18.0
email-validator==1.3.1
exceptiongroup==1.1.1
fastapi==0.89.1
feedfinder2==0.0.4
feedparser==6.0.10
filelock==3.12.0
greenlet==2.0.2
h11==0.14.0
httpcore==0.16.3
......@@ -15,32 +25,62 @@ httptools==0.5.0
httpx==0.23.3
idna==3.4
itsdangerous==2.1.2
jieba3k==0.35.1
Jinja2==3.1.2
joblib==1.2.0
lockfile==0.12.2
lxml==4.9.2
MarkupSafe==2.1.2
newspaper3k==0.2.8
nltk==3.8.1
numpy==1.24.2
orjson==3.8.5
outcome==1.2.0
pandas==2.0.2
passlib==1.7.4
pid==3.0.4
Pillow==9.5.0
pyasn1==0.4.8
pycparser==2.21
pydantic==1.10.4
PySocks==1.7.1
python-daemon==3.0.1
python-dateutil==2.8.2
python-dotenv==0.21.1
python-jose==3.3.0
python-multipart==0.0.5
pytz==2023.3
PyYAML==6.0
regex==2023.5.5
requests==2.28.2
requests-file==1.5.1
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.0.2
scipy==1.10.0
selenium==4.8.2
service==0.6.0
setproctitle==1.3.2
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4
SQLAlchemy==2.0.2
starlette==0.22.0
threadpoolctl==3.1.0
tinysegmenter==0.3
tldextract==3.4.4
tqdm==4.65.0
trio==0.22.0
trio-websocket==0.10.2
typing_extensions==4.4.0
tzdata==2023.3
ujson==5.7.0
urllib3==1.26.15
uvicorn==0.20.0
uvloop==0.17.0
watchfiles==0.18.1
websockets==10.4
wsproto==1.2.0
xgboost==0.90
from pydantic import BaseModel
from pydantic import BaseModel, HttpUrl
# ---------- DATA SCHEMAS ----------
class InputText(BaseModel):
    """Schema to define the input structure of the data to predict."""
    # Raw text whose toxicity will be predicted.
    text: str
class OutputPrediction(BaseModel):
    """Schema to define the output predictions' structure."""
    # Toxicity score returned by the model.
    prediction: float
class InputURL(BaseModel):
    """Schema to define the input structure of a URL to scrape."""
    # Validated URL of the news page; HttpUrl rejects malformed values.
    url: HttpUrl
class ScrapedComments(BaseModel):
    """Schema to define the output structure of the scraped comments."""
    # Comment texts extracted from the scraped news page.
    comments: list[str]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment