I'm trying to scrape MLB box scores and game play-by-play information.
import requests
from bs4 import BeautifulSoup
url = "https://www.sportsnet.ca/baseball/mlb/games/2618275/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# Capture values from linescore__container class
linescore_container = soup.find("div", class_ = "linescore__container")
teams = [team.text.strip() for team in linescore_container.find_all("div", class_ = "team__name")]
scores = [score.text.strip() for score in linescore_container.find_all("div", class_ = "team__score")]
# Capture values from AllIningsPBP__Wrapper class
innings_wrapper = soup.find("div", class_ = "AllIningsPBP__Wrapper")
innings = [inning.text.strip() for inning in innings_wrapper.find_all("div", class_ = "AllIningsPBP__Inning")]
totals = [total.text.strip() for total in innings_wrapper.find_all("div", class_ = "AllIningsPBP__Total")]
# Print captured values
print("title:", soup.title.text)
print("teams:", teams)
print("scores:", scores)
print("innings:", innings)
print("totals:", totals)
///////////////////////////////////
Output (it comes back blank and does not capture the AllIningsPBP__Wrapper values):
title: Sportsnet.ca
teams: []
scores: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
innings: []
totals: []
pitchers: []
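One way to check whether those class names exist in the raw HTML at all (a minimal check, reusing the same URL and class names as above; if they never appear in the response, the content is rendered client-side by JavaScript and requests + BeautifulSoup alone will never see it):
import requests

url = "https://www.sportsnet.ca/baseball/mlb/games/2618275/"
html = requests.get(url).text

# If these names never occur in the raw response, the box score is
# injected by JavaScript and is not present in the HTML parsed above
for name in ["linescore__container", "team__name", "AllIningsPBP__Wrapper"]:
    print(name, "present" if name in html else "missing")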
////////////////
import requests
from bs4 import BeautifulSoup

url = "https://www.mlb.com/gameday/orioles-vs-pirates/2024/04/07/745523/final/wrap"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find and extract the desired data
    title = soup.title.text.strip()
    teams = [team.text.strip() for team in soup.find_all("span", class_ = "team-name")]
    scores = [score.text.strip() for score in soup.find_all("span", class_ = "team-score")]

    # Defined up front so the print below still works if the table is missing
    innings_data = []
    innings_table = soup.find("table", class_ = "linescore-table")
    if innings_table:
        for row in innings_table.find_all("tr"):
            inning_cells = row.find_all("td")
            inning_values = [cell.text.strip() for cell in inning_cells]
            innings_data.append(inning_values)

    innings_description = []
    innings_description_wrapper = soup.find("div", class_ = "AllInningsPBP__Wrapper")
    if innings_description_wrapper:
        innings_description = [item.text.strip() for item in innings_description_wrapper.find_all("div", class_ = "AllIningsPBP__Inning")]

    # Print the extracted data
    print("Title:", title)
    print("Teams:", teams)
    print("Scores:", scores)
    print("Innings Data:", innings_data)
    print("Innings Description:", innings_description)
else:
    print("Failed to retrieve data from the URL.")
/////////////////
import requests
from bs4 import BeautifulSoup

url = "https://www.thebaseballcube.com/content/box/CHN202303300~r/"

# Add a header to mimic a real browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"
}

# Send a GET request to the URL with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract title
    title = soup.title.text.strip()

    # Extract teams
    teams = [team.text.strip() for team in soup.find_all("span", class_ = "bold")]

    # Extract scores
    scores = [score.text.strip() for score in soup.find_all("td", class_ = "bold")]

    # Extract innings data
    innings_data = []
    for inning in soup.find_all("tr", class_ = "box_line_score"):
        inning_text = " ".join(cell.text.strip() for cell in inning.find_all("td"))
        innings_data.append(inning_text)

    # Print the extracted data
    print("Title:", title)
    print("Teams:", teams)
    print("Scores:", scores)
    print("Innings Data:", innings_data)
else:
    print("Failed to retrieve data from the URL.")
////////////////////////
import requests
from bs4 import BeautifulSoup

url = "https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the play-by-play section
    play_by_play_section = soup.find("section", id = "play-by-play")
    if play_by_play_section:
        # Extract the play-by-play data
        play_by_play = play_by_play_section.text.strip()

        # Print the play-by-play data
        print("Play-by-Play:")
        print(play_by_play)
    else:
        print("Play-by-play section not found on the page.")
else:
    print("Failed to retrieve data from the URL.")
////////////////////
Do these websites have any anti-scraping protections? Can we get around them, or do we have to collect the data manually? I've tried several sites.
Expected output — the box score and the game's full Play-by-Play info, something like this:
Diamondbacks
4-6
Final
2 - 5
Atlanta
Braves
6-2
Play-by-Play Box Score
1 2 3 4 5 6 7 8 9 T H E
------------------------------------------
ARI 0 0 0 2 0 0 0 0 0 2 5 1
ATL 0 2 0 1 0 0 0 2 x 5 7 0
W: Chris Sale (1-0)
L: Ryne Nelson (0-2)
S: Pierce Johnson (1)
Game Time: 2:30
Play-by-play: Only Scoring Plays
1 2 3 4 5 6 7 8 9
1st Inning: Hide
T1
Ketel Marte strikes out swinging.
0-0
T1
Corbin Carroll grounds out to first baseman Matt Olson.
0-0
T1
Lourdes Gurriel Jr. lines out sharply to left fielder Jarred Kelenic.
0-0
Middle 1st
B1
Ronald Acuña Jr. pops out to first baseman Christian Walker.
0-0
B1
Ozzie Albies grounds out to first baseman Christian Walker.
0-0
B1
Austin Riley pops out to first baseman Christian Walker in foul territory.
0-0
2nd Inning: Hide
T2
Christian Walker strikes out swinging.
0-0
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # To run Chrome in headless mode
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
# Set up the Chrome driver
service = Service("path/to/your/chromedriver")  # Replace with the path to your chromedriver
driver = webdriver.Chrome(service=service, options=chrome_options)
# URL of the webpage to scrape
url = "https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play"
# Load the webpage
driver.get(url)
# Wait for the play-by-play section to load
time.sleep(5) # Adjust the waiting time as needed
# Find the play-by-play section
play_by_play_section = driver.find_element(By.ID, "play-by-play")
# Get the play-by-play text
play_by_play_text = play_by_play_section.text
# Print the play-by-play data
print("Play-by-play:")
print(play_by_play_text)
# Close the browser
driver.quit()
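If the fixed 5-second sleep turns out to be flaky, an explicit wait is the usual alternative. A short sketch that reuses the driver and the By import from the code above; the 15-second timeout is an arbitrary choice:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for the element with id="play-by-play" to appear,
# instead of sleeping for a fixed amount of time
wait = WebDriverWait(driver, 15)
play_by_play_section = wait.until(
    EC.presence_of_element_located((By.ID, "play-by-play"))
)
print(play_by_play_section.text)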
Hope this helps.
I have no idea what result you're after: you have 5 code snippets and no description of the expected output versus what you actually get. What exactly is the problem?
But from what I can see, a few suggestions. Since the data sits in a <table> in the HTML, use pandas to parse it with .read_html(). It returns a list of dataframes/tables; you can use an index to pull out the one you need, or print them all in a loop.
Code:
import pandas as pd

url = "https://www.thebaseballcube.com/content/box/CHN202303300~r/"
dfs = pd.read_html(url)

for df in dfs:
    print(df)
Output:
0 1 ... 20 21
0 team 1 ... mlb rk 2023 links
1 Milwaukee Brewers 0 ... 4 statistics | roster | game log | NEXT Game
2 Chicago Cubs 0 ... 25 statistics | roster | game log | NEXT Game
[3 rows x 22 columns]
0 1 2 3 ... 17 18 19 20
0 ln player pos age ... avg ops s gm c gm
1 1 Christian Yelich lf 31.115 ... .000 .500 1 1251
2 2 Jesse Winker dh 29.225 ... .000 .000 1 550
3 3 Willy Adames ss 27.209 ... .333 .833 1 574
4 4 Rowdy Tellez 1b 28.014 ... .000 .250 1 430
5 5 William Contreras c 25.096 ... .250 .500 1 155
6 6 Luis Urias 3b 25.300 ... .000 .000 1 397
7 7 Cal Mitchell cf 24.022 ... .000 .000 1 32
8 8 Brian Anderson rf 29.315 ... .500 1.167 1 532
9 9 Brice Turang 2b 23.129 ... .333 .667 1 1
10 Totals Totals Totals Totals ... - - 9 3922
[11 rows x 21 columns]
0 1 2 3 ... 17 18 19 20
0 ln player pos age ... avg ops s gm c gm
1 1 Nico Hoerner 2b 25.321 ... .250 .500 1 250
2 2 Dansby Swanson ss 29.047 ... .750 1.500 1 829
3 3 Ian Happ lf 28.230 ... .500 1.250 1 679
4 4 Cody Bellinger cf 27.260 ... .000 .250 1 746
5 5 Trey Mancini dh 31.011 ... .333 .833 1 752
6 6 Yan Gomes c 35.254 ... .000 .000 1 974
7 7 Eric Hosmer 1b 33.157 ... .000 .000 1 1660
8 8 Patrick Wisdom 3b 31.215 ... .000 .250 1 285
9 9 Miles Mastrobuoni rf 27.150 ... .000 .000 1 10
10 Totals Totals Totals Totals ... - - 9 6185
[11 rows x 21 columns]
0 1 2 3 4 ... 16 17 18 19 20
0 seq player age ip h ... inpl era whip s gm c gm
1 1 Corbin Burnes (L 0-1) 28.159 5.0 4 ... 16 7.20 1.40 1 136
2 2 Peter Strzelecki 28.157 1.0 0 ... 3 0.00 0.00 1 31
3 3 Gus Varland 26.143 1.0 2 ... 4 0.00 3.00 1 1
4 4 Javy Guerra 27.182 1.0 0 ... 2 0.00 0.00 1 45
5 Totals Totals Totals 8.0 6 ... 25 - - NaN NaN
[6 rows x 21 columns]
0 1 2 3 4 ... 16 17 18 19 20
0 seq player age ip h ... inpl era whip s gm c gm
1 1 Marcus Stroman (W 1-0) 31.333 6.0 3 ... 12 0.00 1.00 1 205
2 2 Keegan Thompson 28.017 1.0 0 ... 2 0.00 1.00 1 62
3 3 Brad Boxberger 34.306 1.0 0 ... 1 0.00 1.00 1 485
4 4 Michael Fulmer 30.014 1.0 1 ... 2 0.00 1.00 1 205
5 Totals Totals Totals 9.0 4 ... 17 - - NaN NaN
[6 rows x 21 columns]
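If you only need one of them, index into the list. A small example, assuming the tables come back in the order shown above (linescore first, then the two batting tables, then the pitching lines):
import pandas as pd

url = "https://www.thebaseballcube.com/content/box/CHN202303300~r/"
dfs = pd.read_html(url)

# Assumed ordering, based on the output above
linescore = dfs[0]        # team line score
brewers_batting = dfs[1]  # Milwaukee batting lines
cubs_batting = dfs[2]     # Chicago batting lines

print(linescore)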
Code (Sportsnet exposes the game data as JSON, so you can query the livetracker endpoint directly instead of parsing the page):
import pandas as pd
import requests

gameID = 2616911
url = f'https://mobile-statsv2.sportsnet.ca/livetracker?league=mlb&id={gameID}'

jsonData = requests.get(url).json()
pbp = jsonData['data']['game']['innings']

pbp_df = pd.json_normalize(pbp, record_path='plays')
Output:
event_id event_code description is_out
0 16 65 Schwarber struck out. 1
1 31 51 Turner flied out to center fielder Thompson. 2
2 16 65 Realmuto struck out. 3
3 4 122 Fairchild hit an infield single to third.
4 16 65 Steer struck out. 1
.. ... ... ... ...
73 2 42
74 1 98 Hoffman pitching.
75 16 65 Thompson struck out. 1
76 34 72 Fairchild lined out to center fielder Rojas. 2
77 16 65 Steer struck out. 3
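The numeric id in the Sportsnet page URL looks like the same id the livetracker endpoint takes, so the game from your first snippet should work the same way. An untested sketch, assuming 2618275 (from https://www.sportsnet.ca/baseball/mlb/games/2618275/) is a valid id for this endpoint:
import pandas as pd
import requests

# Assumption: the id from the page URL (.../games/2618275/) is accepted by the livetracker API
gameID = 2618275
url = f'https://mobile-statsv2.sportsnet.ca/livetracker?league=mlb&id={gameID}'

jsonData = requests.get(url).json()
pbp = jsonData['data']['game']['innings']

pbp_df = pd.json_normalize(pbp, record_path='plays')
print(pbp_df[['description', 'is_out']])  # columns seen in the output above
pbp_df.to_csv(f'mlb_{gameID}_pbp.csv', index=False)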
Code (on plaintextsports.com the play-by-play divs are in the static HTML, so plain requests + BeautifulSoup is enough):
import requests
from bs4 import BeautifulSoup

url = 'https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play'

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

pbp_elements = soup.find_all('div', {'class': 'play-by-play'})
for each in pbp_elements:
    print(each.text)
Output:
T1
Gunnar Henderson strikes out swinging.
0-0
T1
Adley Rutschman lines out to right fielder Connor Joe.
0-0
T1
Ryan Mountcastle doubles (4) on a sharp line drive to right fielder Connor Joe.
0-0
T1
Anthony Santander lines out to center fielder Jack Suwinski.
0-0
B1
Oneil Cruz grounds out, second baseman Jorge Mateo to first baseman Ryan Mountcastle.
0-0
B1
Bryan Reynolds strikes out swinging, catcher James McCann to first baseman Ryan Mountcastle.
0-0
B1
Ke'Bryan Hayes singles on a line drive to right fielder Anthony Santander.
0-0
B1
Jack Suwinski pops out to third baseman Jordan Westburg in foul territory.
...
Does the code above not work for you?
Hello, thanks for the reply. I want to scrape the actual scores — right now I only get ['', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '', '', '', ''] — and the Play by Play info from Plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play.