Skip to content

Commit bf7949a

Browse files
Day 27 - Async Web Scraping
1 parent 921207b commit bf7949a

28 files changed

+28727
-0
lines changed

tutorial-reference/Day 27/Pipfile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[[source]]
2+
name = "pypi"
3+
url = "https://pypi.org/simple"
4+
verify_ssl = true
5+
6+
[dev-packages]
7+
8+
[packages]
9+
aiohttp = "*"
10+
11+
[requires]
12+
python_version = "3.8"

tutorial-reference/Day 27/Pipfile.lock

Lines changed: 111 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import asyncio
2+
from aiohttp import ClientSession
3+
import pathlib
4+
async def main():
    """Download the Box Office Mojo 2019 yearly page and return its body.

    Opens a fresh ``ClientSession``, issues a single GET request and returns
    whatever ``response.read()`` yields (the caller decodes it, so this is
    the raw, undecoded payload).
    """
    url = 'https://www.boxofficemojo.com/year/2019/'
    async with ClientSession() as session:
        async with session.get(url) as response:
            # NOTE(review): the HTTP status is not inspected here, so an
            # error page would be returned just like a successful one.
            return await response.read()
11+
12+
13+
# Fetch the page, then store it in a "snapshots" directory under the
# current working directory (created on demand).
html_data = asyncio.run(main())
output_dir = pathlib.Path().resolve() / "snapshots"
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "2019.html"
# Write the downloaded bytes untouched. Decoding them and going through
# write_text() would re-encode with the platform's default encoding, which
# can fail or alter the snapshot on non-UTF-8 systems.
output_file.write_bytes(html_data)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import asyncio
2+
from aiohttp import ClientSession
3+
import pathlib
4+
5+
async def fetch(url, session, year):
    """GET ``url`` through ``session`` and tag the response body with ``year``.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``read()``.
        year: Value stored unchanged under the ``"year"`` key.

    Returns:
        ``{"body": <result of response.read()>, "year": year}``.
    """
    async with session.get(url) as response:
        # NOTE(review): the HTTP status is not checked; an error page is
        # returned like any other body.
        html_body = await response.read()
    return {"body": html_body, "year": year}
9+
10+
async def main(start_year=2020, years_ago=5):
    """Fetch the Box Office Mojo yearly page for several consecutive years.

    Starting at ``start_year`` and walking backwards one year at a time,
    schedules one ``fetch`` task per year (``years_ago`` tasks in total) on
    a single shared ``ClientSession``.

    Returns:
        The list produced by ``asyncio.gather``: one
        ``{"body": ..., "year": ...}`` dict per year, in scheduling order
        (newest year first).
    """
    tasks = []
    async with ClientSession() as session:
        for offset in range(years_ago):
            year = start_year - offset
            url = f'https://www.boxofficemojo.com/year/{year}/'
            print("year", year, url)
            tasks.append(asyncio.create_task(fetch(url, session, year)))
        # Await inside the ``async with`` so the session stays open until
        # every scheduled request has completed.
        pages_content = await asyncio.gather(*tasks)  # [{"body": "..", "year": 2020 }]
    return pages_content
26+
27+
28+
results = asyncio.run(main())

# Snapshots go in a "snapshots" directory under the current working
# directory (created on demand).
output_dir = pathlib.Path().resolve() / "snapshots"
output_dir.mkdir(parents=True, exist_ok=True)

# One file per fetched year, named after the year.
for result in results:
    current_year = result["year"]
    html_data = result["body"]
    output_file = output_dir / f"{current_year}.html"
    # Write the downloaded bytes untouched. Decoding them and going through
    # write_text() would re-encode with the platform's default encoding,
    # which can fail or alter the snapshot on non-UTF-8 systems.
    output_file.write_bytes(html_data)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import asyncio
2+
from aiohttp import ClientSession
3+
import pathlib
4+
5+
async def fetch(url, session, year=None):
    """GET ``url`` through ``session`` and tag the response body with ``year``.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``read()``.
        year: Optional value stored unchanged under the ``"year"`` key.

    Returns:
        ``{"body": <result of response.read()>, "year": year}``.
    """
    async with session.get(url) as response:
        # NOTE(review): the HTTP status is not checked; an error page is
        # returned like any other body.
        html_body = await response.read()
    return {"body": html_body, "year": year}
9+
10+
async def fetch_with_sem(sem, session, url, year=None):
    """Run ``fetch(url, session, year)`` while holding ``sem``.

    ``sem`` is entered as an async context manager before delegating, so the
    number of fetches in flight at once is bounded by whatever ``sem``
    allows. The result of ``fetch`` is returned unchanged.
    """
    async with sem:
        return await fetch(url, session, year)
13+
14+
async def main(start_year=2020, years_ago=20, max_concurrency=10):
    """Fetch yearly Box Office Mojo pages with a cap on concurrent requests.

    Starting at ``start_year`` and walking backwards one year at a time,
    schedules one ``fetch_with_sem`` task per year (``years_ago`` tasks in
    total). All tasks share one ``ClientSession`` and one semaphore.

    Args:
        start_year: Most recent year to fetch.
        years_ago: How many consecutive years to fetch, counting backwards.
        max_concurrency: Initial value of the shared ``asyncio.Semaphore``.

    Returns:
        The list produced by ``asyncio.gather``: one
        ``{"body": ..., "year": ...}`` dict per year, in scheduling order
        (newest year first).
    """
    tasks = []
    sem = asyncio.Semaphore(max_concurrency)
    async with ClientSession() as session:
        for offset in range(years_ago):
            year = start_year - offset
            url = f'https://www.boxofficemojo.com/year/{year}/'
            print("year", year, url)
            tasks.append(
                asyncio.create_task(fetch_with_sem(sem, session, url, year=year))
            )
        # Await inside the ``async with`` so the session stays open until
        # every scheduled request has completed.
        pages_content = await asyncio.gather(*tasks)  # [{"body": "..", "year": 2020 }]
    return pages_content
31+
32+
33+
results = asyncio.run(main())

# Snapshots go in a "snapshots" directory under the current working
# directory (created on demand).
output_dir = pathlib.Path().resolve() / "snapshots"
output_dir.mkdir(parents=True, exist_ok=True)

# One file per fetched year, named after the year.
for result in results:
    current_year = result["year"]
    html_data = result["body"]
    output_file = output_dir / f"{current_year}.html"
    # Write the downloaded bytes untouched. Decoding them and going through
    # write_text() would re-encode with the platform's default encoding,
    # which can fail or alter the snapshot on non-UTF-8 systems.
    output_file.write_bytes(html_data)

tutorial-reference/Day 27/async.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import asyncio
2+
import time
3+
4+
# Sleep durations, in seconds; main() starts one sleeper task per entry.
iteration_times = [1, 3, 2, 4]
5+
6+
async def sleeper(seconds, i=-1):
    """Sleep for ``seconds`` and report how long the sleep actually took.

    Args:
        seconds: Delay handed to ``asyncio.sleep``.
        i: Optional label. Unless it is ``-1``, the label and the requested
            delay are printed (tab-separated) before sleeping.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    if i != -1:
        print(f"{i}\t{seconds}s")
    await asyncio.sleep(seconds)
    return time.perf_counter() - start_time
12+
13+
# Longest single sleeper duration seen so far (updated by main()).
run_time = 0
# Sum of every sleeper's duration (updated by main()).
total_compute_run_time = 0


async def main():  # coroutine
    """Run one ``sleeper`` per entry of ``iteration_times`` concurrently.

    After all sleepers finish, folds their reported durations into the
    module globals: ``total_compute_run_time`` accumulates the sum and
    ``run_time`` keeps the largest single duration.
    """
    global run_time
    global total_compute_run_time
    tasks = [
        asyncio.create_task(sleeper(second, i=i))
        for i, second in enumerate(iteration_times)
    ]
    results = await asyncio.gather(*tasks)
    for run_time_result in results:
        total_compute_run_time += run_time_result
        run_time = max(run_time, run_time_result)
31+
32+
# Calling main() on its own would only create a coroutine object without
# running it; asyncio.run() is what actually drives it to completion.
asyncio.run(main())
# Guard the ratio so a zero total (e.g. no sleepers scheduled) does not
# raise ZeroDivisionError.
ratio = run_time / total_compute_run_time if total_compute_run_time else 0.0
print(f"Ran for {run_time} seconds, with a total of {total_compute_run_time} and {ratio}")
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
def aaa():
    """Print ``hello`` to standard output."""
    print("hello")
3+
4+
def zzz():
    """Print ``hello`` to standard output."""
    print("hello")
6+
7+
# aaa()
8+
#...
9+
# zzz()

0 commit comments

Comments
 (0)