Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Jaime Collado
/
socialfairness-api
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
7dadb6b5
authored
Nov 09, 2023
by
Jaime Collado
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
Fixed bug in parallelized endpoint
parent
085369f4
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
53 additions
and
24 deletions
app/ScraperNoticias
app/api.py
ScraperNoticias
@
26feabea
Subproject commit
95c75c27f876ccc49da32a0fceddfcaee625ea46
Subproject commit
26feabea82aef9bf695c168ccc9da90e0ade023e
app/api.py
View file @
7dadb6b5
from
multiprocessing
import
Pool
import
time
import
multiprocessing
as
mp
from
fastapi
import
FastAPI
,
HTTPException
,
status
from
fastapi.middleware.cors
import
CORSMiddleware
...
...
@@ -6,9 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
import
schemas
from
ScraperNoticias.globalScraper
import
GlobalScraper
import
time
# APP
app
=
FastAPI
()
#app = FastAPI(openapi_url=None) # Disable interactive docs
...
...
@@ -25,15 +24,14 @@ app.add_middleware(
# ---------- UTILS ----------
def
test_scraper
(
url
):
scraped_comments
=
scrap
(
url
)
if
scraped_comments
:
return
True
return
False
return
True
if
scrap
(
url
)
else
False
def
scrap
(
url
):
print
(
f
"Process {mp.current_process().name} started scraping {url}"
)
scraper
=
GlobalScraper
(
url
)
df
=
scraper
.
process
()
scraped_comments
=
df
.
comments
.
tolist
()
print
(
f
"Process {mp.current_process().name} ended scraping {url}"
)
return
scraped_comments
...
...
@@ -79,10 +77,6 @@ async def scrap_url(
Returns:
A list containing the scraped comments."""
print
(
"Antes del sleep"
)
time
.
sleep
(
10
)
print
(
"Después del sleep"
)
scraped_comments
=
scrap
(
url
.
url
)
if
not
scraped_comments
:
...
...
@@ -92,42 +86,77 @@ async def scrap_url(
)
return
{
"comments"
:
scraped_comments
}
@app.get
(
"/test-scraper"
,
response_model
=
schemas
.
ScrapersTested
,
tags
=
[
"Scraper"
])
@app.get
(
"/test-scraper
s
"
,
response_model
=
schemas
.
ScrapersTested
,
tags
=
[
"Scraper"
])
async
def
test_scrapers
():
"""Tests whether the scrapers work or not.
Returns:
A dictionary with the name of the newspaper as key and a boolean as value.
"""
start
=
time
.
time
()
newspapers
=
{
"elpais"
:
"https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html"
,
"okdiario"
:
"https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826"
,
"elmundo"
:
"https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html"
,
"20minutos"
:
"https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/"
,
#
"20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
"elconfidencial"
:
"https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/"
,
"marca"
:
"https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html"
,
"abc"
:
"https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https
%3
A
%2
F
%2
Fwww.abc.es
%2
F"
,
"elespanol"
:
"https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html"
,
"ntvespana"
:
"https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/"
,
"theobjective"
:
"https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/"
,
#
"theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
"elperiodistadigital"
:
"https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/"
,
"vozpopuli"
:
"https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html"
,
"eldebate"
:
"https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html"
,
"alertadigital"
:
"https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
}
test_result
=
{
k
:
False
for
k
,
_
in
newspapers
.
items
()}
mp
.
set_start_method
(
"spawn"
)
# Fix for FastAPI process shutting down when all processes end. Seen here: https://github.com/tiangolo/fastapi/issues/1487
with
mp
.
Pool
(
processes
=
8
)
as
pool
:
test_results
=
pool
.
map
(
test_scraper
,
newspapers
.
values
())
scrapers_test
=
{
newspaper
:
test_result
for
newspaper
,
test_result
in
zip
(
newspapers
.
keys
(),
test_results
)}
end
=
time
.
time
()
print
(
"Tiempo de ejecución:"
,
end
-
start
)
return
{
"scrapers"
:
scrapers_test
}
pool
=
Pool
(
processes
=
14
)
@app.get
(
"/test-scrapers-deprecated"
,
response_model
=
schemas
.
ScrapersTested
,
tags
=
[
"Scraper"
])
async
def
test_scrapers_deprecated
():
"""Tests whether the scrapers work or not.
Returns:
A dictionary with the name of the newspaper as key and a boolean as value.
"""
start
=
time
.
time
()
newspapers
=
{
"elpais"
:
"https://elpais.com/espana/2023-11-07/una-maniobra-judicial-que-amenaza-con-dejar-a-puigdemont-fuera-de-la-ley-de-amnistia.html"
,
"okdiario"
:
"https://okdiario.com/espana/policias-indignados-marlaska-cataluna-aguanto-6-dias-ferraz-cargo-20-minutos-11867826"
,
"elmundo"
:
"https://www.elmundo.es/internacional/2023/11/07/654ab70e21efa00b138b45cc.html"
,
# "20minutos": "https://www.20minutos.es/noticia/5188036/0/cancelan-cuenta-youtube-iker-jimenez-viva-libertad/",
"elconfidencial"
:
"https://www.elconfidencial.com/espana/2023-11-07/puigdemont-fiscal-tsunami-terrorismo-23j_3769680/"
,
"marca"
:
"https://www.marca.com/futbol/real-madrid/2023/11/08/654a7e8e22601d772a8b458a.html"
,
"abc"
:
"https://www.abc.es/espana/madrid/detenidos-empleados-empresa-desokupa-antecedentes-coaccionar-armas-20231107040950-nt.html?ref=https
%3
A
%2
F
%2
Fwww.abc.es
%2
F"
,
"elespanol"
:
"https://www.elespanol.com/espana/20231108/crece-protesta-sede-psoe-disturbios-acaban-batalla-campal-heridos/808169177_0.html"
,
"ntvespana"
:
"https://ntvespana.com/26/10/2023/la-burbuja-de-josue-cardenas-entrevistas-a-pio-moa-y-ramon-peralta-por-josue-cardenas/"
,
# "theobjective": "https://theobjective.com/espana/politica/2023-11-08/costa-sanchez-consejo-europeo/",
"elperiodistadigital"
:
"https://www.periodistadigital.com/periodismo/periodismo-online/20231108/ardio-ferraz-margenes-democracia-han-sobrepasado-video-689404954226/"
,
"vozpopuli"
:
"https://www.vozpopuli.com/espana/pedro-sanchez-concentraciones-sedes-socialistas.html"
,
"eldebate"
:
"https://www.eldebate.com/espana/20231108/el-ico-de-calvino-concedio-86400-a-la-galeria-de-arte-del-hijo-mayor-de-pujol-con-la-que-blanqueaba-comisiones_152023.html"
,
"alertadigital"
:
"https://www.alertadigital.com/2023/11/07/de-la-primavera-arabe-al-otono-espanol/"
}
test_result
=
{
k
:
False
for
k
,
_
in
newspapers
.
items
()}
for
newspaper
,
url
in
newspapers
.
items
():
test_result
[
newspaper
]
=
pool
.
apply_async
(
test_scraper
,
[
url
])
pool
.
close
()
pool
.
join
()
test_result
[
newspaper
]
=
test_scraper
(
url
)
final
=
{
k
:
v
.
get
()
for
k
,
v
in
test_result
.
items
()}
end
=
time
.
time
()
print
(
"Tiempo de ejecución:"
,
end
-
start
)
return
{
"scrapers"
:
final
}
return
{
"scrapers"
:
test_result
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment