Function bodies 51 total
get_player_ids function · python · L41-L60 (20 LOC)scripts/acquiring/transfermarkt-api.py
def get_player_ids(season: int) -> List[int]:
    """Get the player ids from the players asset from transfermarkt-scraper source.

    Every non-blank line of the gzipped file is parsed as one JSON object and
    the id is taken from the last "/"-separated segment of its "href" value.

    Args:
        season (int): Season used to build the path of the players asset.

    Returns:
        List[int]: List of player ids
    """
    players_asset_path = f"data/raw/transfermarkt-scraper/{season}/players.json.gz"

    player_ids = []
    # iterate the zipped file line by line instead of loading every line first
    with gzip.open(players_asset_path, mode="r") as z:
        for line in z:
            # a blank line is not a JSON document and would make json.loads fail
            if not line.strip():
                continue
            player = json.loads(line)
            player_ids.append(int(player["href"].split("/")[-1]))

    logging.info(f"Fetched {len(player_ids)} player ids from {players_asset_path}")

    return player_ids

# fetch_data function · python · L63-L85 (23 LOC)scripts/acquiring/transfermarkt-api.py
async def fetch_data(session, url, player_id):
    """Request ``url`` through ``session`` and pair the decoded body with the player id.

    Args:
        session (aiohttp.ClientSession): The aiohttp session
        url (str): The API URL
        player_id (int): The player ID

    Returns:
        dict: ``{"response": ..., "player_id": player_id}``. "response" holds the
        decoded JSON body, or None when decoding raises aiohttp.ContentTypeError.
    """
    request_headers = {
        'Content-Type': 'application/json',
        'User-Agent': USER_AGENT
    }
    # start from the failure shape; "response" is filled in on success
    outcome = {"response": None, "player_id": player_id}

    async with session.get(url=url, headers=request_headers, ssl=False) as response:
        try:
            outcome["response"] = await response.json()
        except aiohttp.ContentTypeError as e:
            logging.error(f"Failed to fetch data for player {player_id}: {e}")

    return outcome

# get_market_values function · python · L88-L106 (19 LOC)scripts/acquiring/transfermarkt-api.py
async def get_market_values(player_ids: List[int]) -> List[dict]:
    """Request the market value endpoint once per player id.

    Args:
        player_ids (List[int]): List of player ids

    Returns:
        List[dict]: One ``fetch_data`` result per player id
    """
    logging.info(f"Requesting market values for {len(player_ids)} players")

    async with aiohttp.ClientSession() as session:
        pending = []
        for player_id in player_ids:
            endpoint = MARKET_VALUES_API + str(player_id)
            pending.append(fetch_data(session, endpoint, player_id))

        # asyncio.gather awaits all requests concurrently
        responses = await asyncio.gather(*pending)

    return responses

# get_transfers function · python · L109-L127 (19 LOC)scripts/acquiring/transfermarkt-api.py
async def get_transfers(player_ids: List[int]) -> List[dict]:
    """Request the transfer history endpoint once per player id.

    Args:
        player_ids (List[int]): List of player ids

    Returns:
        List[dict]: One ``fetch_data`` result per player id
    """
    logging.info(f"Requesting transfer history for {len(player_ids)} players")

    async with aiohttp.ClientSession() as session:
        pending = []
        for player_id in player_ids:
            endpoint = TRANSFERS_API + str(player_id)
            pending.append(fetch_data(session, endpoint, player_id))

        # asyncio.gather awaits all requests concurrently
        responses = await asyncio.gather(*pending)

    return responses

# persist_data function · python · L129-L137 (9 LOC)scripts/acquiring/transfermarkt-api.py
def persist_data(data: List[dict], path: str) -> None:
    """Write ``data`` to ``path`` as JSON lines (one serialised item per line).

    Args:
        data (List[dict]): List of dicts with data to persist
        path (str): Path where to store the data
    """
    with open(path, "w") as sink:
        for record in data:
            sink.write(json.dumps(record))
            sink.write("\n")

# run_for_season function · python · L139-L168 (30 LOC)scripts/acquiring/transfermarkt-api.py
def run_for_season(season: int) -> None:
"""Run all steps for a given season.
Args:
season (int): The season to process
"""
target_market_values_path = f"data/raw/transfermarkt-api/{season}/market_values.json"
target_transfers_path = f"data/raw/transfermarkt-api/{season}/transfers.json"
logging.info(f"Starting player data acquisition for season {season}")
# create target directories if they do not exist
pathlib.Path(target_market_values_path).parent.mkdir(parents=True, exist_ok=True)
pathlib.Path(target_transfers_path).parent.mkdir(parents=True, exist_ok=True)
# get player IDs for the season
player_ids = get_player_ids(season)
# collect market values and transfers for players in SEASON
market_values = asyncio.run(get_market_values(player_ids))
transfers = asyncio.run(get_transfers(player_ids))
# filter out player ids in responses that are not in the original list
transfers = [item for item in transfers if iteAsset.all method · python · L66-L74 (9 LOC)scripts/acquiring/transfermarkt-scraper.py
def all(cls):
    """Get an ordered list of assets to be acquired.

    One Asset is created for every name in ``cls.asset_parents`` except
    'competitions', then ``set_parent()`` is called on each of them.
    Asset acquisition steps have dependencies between each other; the list is
    meant to be in the order the acquisition steps should run.
    """
    names = filter(lambda name: name != 'competitions', cls.asset_parents)
    assets = list(map(Asset, names))

    for asset in assets:
        asset.set_parent()

    return assets
run_tfmkt function · python · L76-L102 (27 LOC)scripts/acquiring/transfermarkt-scraper.py
def run_tfmkt(crawler, season=None, parents_file=None):
    """Run a tfmkt CLI command and return its stdout output.

    Args:
        crawler (str): The crawler to run (e.g. 'clubs', 'players', 'confederations').
        season (int, optional): The season year, passed as ``-s`` when given.
        parents_file (str, optional): Path to the parents JSONL file, passed as ``-p`` when given.

    Returns:
        str: The stdout output (JSONL).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero return code.
    """
    cmd = ["tfmkt", crawler]
    for flag, value in (("-s", season), ("-p", parents_file)):
        if value is not None:
            cmd += [flag, str(value)]

    logging.info(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        return result.stdout

    # failure path: log what is known, then surface the error to the caller
    logging.error(f"tfmkt failed with return code {result.returncode}")
    if result.stderr:
        logging.error(f"stderr: {result.stderr}")
    raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)

# acquire_asset function · python · L104-L122 (19 LOC)scripts/acquiring/transfermarkt-scraper.py
def acquire_asset(asset, season):
    """Acquire a single asset for a given season using the tfmkt CLI.

    The CLI output is written gzip-compressed to ``asset.file_path(season)``,
    replacing any file already there.
    """
    pathlib.Path(f"data/raw/transfermarkt-scraper/{season}").mkdir(parents=True, exist_ok=True)

    parent_file = None
    if asset.parent:
        parent_file = asset.parent.file_full_path(season)
    target = asset.file_path(season)

    logging.info(f"Acquiring {asset.name} for season {season}")
    scraped = run_tfmkt(asset.name, season=season, parents_file=parent_file)

    # Drop the previous file, if any, before writing the fresh one
    if target.exists():
        os.remove(str(target))

    with gzip.open(str(target), 'wt') as f:
        f.write(scraped)

    logging.info(f"Wrote {target}")

# acquire_on_local function · python · L124-L141 (18 LOC)scripts/acquiring/transfermarkt-scraper.py
def acquire_on_local(asset, seasons):
    """Acquire one asset, or every asset when ``asset`` is 'all', for each season.

    Args:
        asset (str): An asset name, or 'all'.
        seasons (str): Seasons string, expanded with ``seasons_list``.
    """
    expanded_seasons = seasons_list(seasons)

    if asset == 'all':
        expanded_assets = Asset.all()
    else:
        single = Asset(name=asset)
        single.set_parent()
        expanded_assets = [single]

    for season in expanded_seasons:
        for asset_obj in expanded_assets:
            acquire_asset(asset_obj, season)

# publish_to_dataworld function · python · L15-L83 (69 LOC)scripts/synching/sync-dataworld.py
def publish_to_dataworld(folder):
"""Push the contents of the folder to data.world's dataset dcereijo/player-scores
:param folder: dataset folder path
"""
with open(folder + '/dataset-metadata.json') as metadata_file:
metadata = json.load(metadata_file)
dw_files = []
for resource in metadata['resources']:
filename = resource['path']
if not filename.endswith('.gz'):
filename += '.gz'
url = f'{R2_PUBLIC_URL}/{filename}'
dw_files.append(
{
'name': resource['title'],
'description': (resource['description'])[0:120],
'labels': ['clean data'],
'source': {
'url': url
}
}
)
metadata['summary'] = metadata['description']
metadata['description'] = "Clean, structured and automatically updated football (soccer) data from Transfermarkt"
metadata['tags'] = metadata['keywords']
metadata['license'] = metadata['licenses'][0]['CC0']
metadata['visibility'] = 'OPEN'
del metadata['keyworpublish_to_kaggle function · python · L11-L24 (14 LOC)scripts/synching/sync-kaggle.py
def publish_to_kaggle(folder, message):
    """Create a new Kaggle dataset version from the contents of the folder

    :param folder: dataset folder path
    :param message: a string message with version notes
    """
    client = KaggleApi()
    client.authenticate()

    # https://github.com/Kaggle/kaggle-api/blob/master/kaggle/api/kaggle_api_extended.py#L1317
    version_args = {"folder": folder, "version_notes": message}
    client.dataset_create_version(**version_args)

# sync_to_r2 function · python · L22-L41 (20 LOC)scripts/synching/sync-r2.py
def sync_to_r2(prep_dir):
    """Upload the prepared files found in ``prep_dir`` to the R2 bucket.

    Every ``*.csv.gz`` file is uploaded (in sorted order), followed by
    "dataset-metadata.json" and "transfermarkt-datasets.zip" when they exist.
    Credentials are read from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
    """
    s3 = boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )

    def upload(local_path, name):
        # announce and push a single file under the configured prefix
        key = f"{R2_PREFIX}/{name}"
        print(f" {name} -> s3://{R2_BUCKET}/{key}")
        s3.upload_file(local_path, R2_BUCKET, key)

    pattern = os.path.join(prep_dir, "*.csv.gz")
    for filepath in sorted(glob.glob(pattern)):
        upload(filepath, os.path.basename(filepath))

    for extra in ("dataset-metadata.json", "transfermarkt-datasets.zip"):
        path = os.path.join(prep_dir, extra)
        if os.path.exists(path):
            upload(path, extra)

# top_n_players function · python · L10-L15 (6 LOC)streamlit/pages/03_💰_app:_player_value.py
def top_n_players(df: pd.DataFrame, n: int) -> List[str]:
    """Return the "name" values of the ``n`` rows with the highest "market_value_in_eur"."""
    ranked = df.sort_values(by="market_value_in_eur", ascending=False)
    leaders = ranked.head(n)
    return leaders["name"].values

# load_td function · python · L20-L33 (14 LOC)streamlit/utils.py
def load_td() -> Dataset:
    """Instantiate and initialise a Dataset, so it can be used in the app.

    When the STREAMLIT environment variable equals "cloud", the prepared data
    is first pulled with ``dvc pull data/prep``.

    Returns:
        Dataset: A transfermark_datasets.core.Dataset that is initialised and ready to be used.
    """
    # .get() so that an unset STREAMLIT variable means "not on cloud"
    # instead of raising KeyError
    if os.environ.get("STREAMLIT") == "cloud":
        os.system("dvc pull data/prep")

    td = Dataset()
    td.load_assets()

    return td
read_file_contents function · python · L35-L44 (10 LOC)streamlit/utils.py
def read_file_contents(file_path: str) -> str:
    """Read a file on disk (e.g. a markdown file) as a string.

    Args:
        file_path (str): The path of the file to be read.

    Returns:
        str: The contents of the file as a string.
    """
    # decode explicitly as UTF-8 so the result does not depend on the locale
    return Path(file_path).read_text(encoding="utf-8")

# draw_dataset_index function · python · L46-L61 (16 LOC)streamlit/utils.py
def draw_dataset_index(td: Dataset) -> None:
    """Render a markdown bullet list in the sidebar with one anchor link per public asset.

    Args:
        td (Dataset): The dataset whose assets are listed.
    """
    def index_entry(asset):
        label = titleize(asset.frictionless_resource_name).lower()
        anchor = dasherize(asset.frictionless_resource_name).lower()
        return f"* [{label}](#{anchor})"

    public_assets = (asset for asset in td.assets.values() if asset.public)
    st.sidebar.markdown("\n".join(index_entry(asset) for asset in public_assets))

# draw_asset function · python · L63-L90 (28 LOC)streamlit/utils.py
def draw_asset(asset: Asset) -> None:
    """Draw a transfermarkt-dataset asset summary.

    Shows the asset title and description, a records metric, and two
    expanders ("Attributes" and "Explore").

    Args:
        asset (Asset): The asset to draw.
    """
    summary_col, metric_col = st.columns([5,1])

    heading = titleize(asset.frictionless_resource_name).lower()
    summary_col.subheader(heading)
    summary_col.markdown(asset.description)

    new_records = get_records_delta(asset)
    metric_args = dict(
        label="# of records",
        value=len(asset.prep_df),
        delta=new_records,
        help="Total number of records in the asset / New records in the past week"
    )
    metric_col.metric(**metric_args)

    with st.expander("Attributes"):
        draw_asset_schema(asset)

    with st.expander("Explore"):
        draw_asset_explore(asset)

    st.markdown("---")

# draw_asset_explore function · python · L92-L146 (55 LOC)streamlit/utils.py
def draw_asset_explore(asset: Asset) -> None:
"""Draw dataframe together with dynamic filters for exploration.
Args:
asset (Asset): The asset to draw the explore for.
"""
tagged_columns = [
field.name
for field in asset.schema.get_fields_by_tag("explore")
]
default_columns = list(asset.prep_df.columns[:4].values)
if len(tagged_columns) > 0:
columns = tagged_columns
else:
columns = default_columns
filter_columns = st.multiselect(
label="Search by",
options=asset.prep_df.columns,
default=columns
)
if len(filter_columns) == 0:
filter_columns = columns
st_cols = st.columns(len(filter_columns))
df = asset.prep_df.copy()
for st_col, at_col in zip(st_cols, filter_columns):
options = list(df[at_col].unique())
selected = st_col.selectbox(
label=at_col,
options=options,
key=(asset.name + "-" + at_col)
render_svg function · python · L156-L186 (31 LOC)streamlit/utils.py
def render_svg(svg, caption):
    """Renders the given svg string.

    The svg is UTF-8 encoded, base64 encoded and embedded as a data URI in an
    <img> tag. The image is wrapped in a <figure> together with a <figcaption>
    and an inline <style> block, and the result is written with ``st.write``.

    Args:
        svg (str): The SVG markup to render.
        caption: Text interpolated into the <figcaption> element.
    """
    b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
    html_style = """
<style>
figure {
border: 1px #cccccc solid;
padding: 4px;
margin: auto;
}
figcaption {
background-color: black;
color: white;
font-style: italic;
padding: 2px;
text-align: center;
}
</style>
"""
    html_image = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
    # NOTE(review): caption is interpolated as-is and rendered with
    # unsafe_allow_html=True below; confirm callers only pass trusted text.
    html_caption = f"<figcaption>{caption}</figcaption>"
    html_figure = f"""
{html_style}
<figure>
{html_image}
{html_caption}
</figure>
"""
    st.write(html_figure, unsafe_allow_html=True)

# get_records_delta function · python · L194-L211 (18 LOC)streamlit/utils.py
def get_records_delta(asset: Asset, offset: int = 7) -> int:
    """Count the asset records whose "date" falls within the last ``offset`` days.

    Args:
        asset (Asset): The asset to be calculating the delta from.
        offset (int, optional): Number of days to consider for the delta calculation. Defaults to 7.

    Returns:
        int: Number of records with a "date" later than now minus ``offset`` days,
        or None when the asset has no "date" column.
    """
    frame = asset.prep_df

    # assets without a date column have no delta
    if "date" not in frame.columns:
        return None

    dates = pd.to_datetime(frame["date"])
    cutoff = datetime.now() - timedelta(days=offset)
    recent = frame[dates > cutoff]

    return len(recent)

# CurAppearancesAsset.__init__ method · python · L17-L50 (34 LOC)transfermarkt_datasets/assets/cur_appearances.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema(
fields=[
Field(name="appearance_id", type="integer"),
Field(name="game_id", type="integer"),
Field(name="player_id", type="integer"),
Field(
name="player_club_id",
type="integer",
description="ID of the club that the player belonged to at the time of the game."
),
Field(
name="player_current_club_id",
type="integer",
description="ID of the club that the player currently belongs to."
),
Field(name="date", type="date", tags=["explore"]),
Field(name="player_name", type="string", tags=["explore"]),
Field(name="competition_id", type="string"),
Field(name="yellow_cards", type="integer"),
Field(name="red_cards", type="integer"),
Field(name="goals", type="integer"),
Field(name="assists", type="integer"),
CurClubGamesAsset.__init__ method · python · L35-L65 (31 LOC)transfermarkt_datasets/assets/cur_club_games.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema()
self.schema.add_field(Field(name='club_id', type='integer'))
self.schema.add_field(Field(name='game_id', type='integer'))
self.schema.add_field(Field(name='own_goals', type='integer'))
self.schema.add_field(Field(name='own_position', type='integer'))
self.schema.add_field(Field(name='own_manager_name', type="string", tags=["explore"]))
self.schema.add_field(Field(name='opponent_id', type='integer'))
self.schema.add_field(Field(name='opponent_goals', type='integer'))
self.schema.add_field(Field(name='opponent_position', type='integer'))
self.schema.add_field(Field(name='opponent_manager_name', type='string'))
self.schema.add_field(Field(
name="hosting",
type="string",
description="'Home' if the game took place at the club home stadium and 'Away' if at its opponent stadium"
))
self.schema.add_field(Field(
Powered by Repobility — scan your code at https://repobility.com
CurClubsAsset.__init__ method · python · L19-L60 (42 LOC)transfermarkt_datasets/assets/cur_clubs.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema()
self.schema.add_field(Field(name='club_id', type='integer'))
self.schema.add_field(Field(name='club_code', type='string'))
self.schema.add_field(Field(name='name', type='string'))
self.schema.add_field(Field(name='domestic_competition_id', type='string', tags=["explore"]))
self.schema.add_field(Field(
name='total_market_value',
type='number',
description="Aggregated players' Transfermarkt market value in millions of pounds"
)
)
self.schema.add_field(Field(name='squad_size', type='integer'))
self.schema.add_field(Field(name='average_age', type='number'))
self.schema.add_field(Field(name='foreigners_number', type='integer'))
self.schema.add_field(Field(name='foreigners_percentage', type='number'))
self.schema.add_field(Field(name='national_team_players', type='integer'))
self.schema.add_field(Field(CurCompetitionsAsset.__init__ method · python · L16-L43 (28 LOC)transfermarkt_datasets/assets/cur_competitions.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema()
self.schema.add_field(Field(name="competition_id", type="string"))
self.schema.add_field(Field(name="competition_code", type="string"))
self.schema.add_field(Field(name="name", type="string"))
self.schema.add_field(Field(name="type", type="string"))
self.schema.add_field(Field(name="sub_type", type="string"))
self.schema.add_field(Field(
name="is_major_national_league",
type="boolean",
description="Competition is a major national league in the confederation."
)
)
self.schema.add_field(Field(name="country_id", type="integer"))
self.schema.add_field(Field(name="country_name", type="string"))
self.schema.add_field(Field(name="domestic_league_code", type="string"))
self.schema.add_field(Field(name="confederation", type="string", tags=["explore"]))
self.schema.add_field(Field(
name="url",
typCurGameEventsAsset.__init__ method · python · L15-L55 (41 LOC)transfermarkt_datasets/assets/cur_game_events.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema(
fields=[
Field(
name='game_event_id',
type='string',
description="Surrogate key"
),
Field(name='date', type='date'),
Field(name='game_id', type='integer'),
Field(name='player_id', type='integer'),
Field(name='club_id', type='integer'),
Field(name='club_name', type='string'),
Field(name='type', type='string'),
Field(name='minute', type='integer'),
Field(name='description', type='string'),
Field(
name='player_in_id',
type='string',
description="For subsitution events, ID of the player who joins the game. Null otherwise"
),
Field(
name='player_assist_id',
type='string',
description="For goal events, ID of the player who did the assist. Null otherwise"
),
]
)
sCurGameLineupsAsset.__init__ method · python · L13-L39 (27 LOC)transfermarkt_datasets/assets/cur_game_lineups.py
def __init__(self, *args, **kwargs) -> None:
    """Initialise the asset and declare the game lineups schema and its primary key."""
    super().__init__(*args, **kwargs)

    # every field after the surrogate key only needs a name and a type
    plain_fields = (
        ('game_id', 'integer'),
        ('player_id', 'integer'),
        ('club_id', 'integer'),
        ('type', 'string'),
        ('player_name', 'string'),
        ('team_captain', 'string'),
        ('number', 'string'),
        ('position', 'string'),
        ('date', 'date'),
    )
    lineup_fields = [
        Field(name='game_lineups_id', type='string', description="Surrogate key")
    ]
    lineup_fields += [
        Field(name=field_name, type=field_type)
        for field_name, field_type in plain_fields
    ]

    self.schema = Schema(fields=lineup_fields)
    self.schema.primary_key = ['game_id', 'player_id', 'club_id']

# CurGamesAsset.__init__ method · python · L14-L49 (36 LOC)transfermarkt_datasets/assets/cur_games.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema(
fields=[
Field(name='game_id', type='integer'),
Field(name='competition_id', type='string', tags=["explore"]),
Field(name='competition_type', type='string'),
Field(name='season', type='integer', tags=["explore"]),
Field(name='round', type='string', tags=["explore"]),
Field(name='date', type='date', tags=["explore"]),
Field(name='home_club_id', type='integer'),
Field(name='away_club_id', type='integer'),
Field(name='home_club_goals', type='integer'),
Field(name='away_club_goals', type='integer'),
Field(name='aggregate', type='string'),
Field(name='home_club_position', type='integer'),
Field(name='away_club_position', type='integer'),
Field(name='home_club_name', type='string', tags=["explore"]),
Field(name='away_club_name', type='string', tags=["exploreCurPlayersAsset.__init__ method · python · L21-L72 (52 LOC)transfermarkt_datasets/assets/cur_players.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema(
fields=[
Field(name="player_id", type="integer"),
Field(name="name", type="string"),
Field(name="current_club_id", type="integer"),
Field(name="current_club_name", type="string", tags=["explore"]),
Field(name="country_of_citizenship", type="string"),
Field(name="country_of_birth", type="string"),
Field(name="city_of_birth", type="string"),
Field(name="date_of_birth", type="date"),
Field(name="position", type="string"),
Field(name="sub_position", type="string"),
Field(name="foot", type="string"),
Field(name="height_in_cm", type="integer"),
Field(
name="market_value_in_eur",
type="number",
description="The player's current market value in EUR."
),
Field(
name="highest_market_value_in_eur",
type="number",
CurPlayerValuationsAsset.__init__ method · python · L17-L38 (22 LOC)transfermarkt_datasets/assets/cur_player_valuations.py
def __init__(self, *args, **kwargs) -> None:
    """Initialise the asset and declare the player valuations schema.

    The primary key is (player_id, date); player_id references cur_players.
    """
    super().__init__(*args, **kwargs)

    plain_fields = (
        ('date', 'date'),
        ('player_id', 'integer'),
        ('current_club_name', 'string'),
        ('current_club_id', 'integer'),
        ('market_value_in_eur', 'number'),
    )
    valuation_fields = [
        Field(name=field_name, type=field_type)
        for field_name, field_type in plain_fields
    ]
    valuation_fields.append(
        Field(name='player_club_domestic_competition_id', type='string', tags=["explore"])
    )

    self.schema = Schema(fields=valuation_fields)
    self.schema.primary_key = ['player_id', 'date']

    players_reference = {"resource": "cur_players", "fields": "player_id"}
    self.schema.foreign_keys = [
        {"fields": "player_id", "reference": players_reference}
    ]

# CurTransfersAsset.__init__ method · python · L15-L44 (30 LOC)transfermarkt_datasets/assets/cur_transfers.py
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.schema = Schema(
fields=[
Field(name="player_id", type="integer"),
Field(name="player_name", type="string"),
Field(name="transfer_date", type="date"),
Field(name="transfer_season", type="string"),
Field(name="from_club_id", type="integer"),
Field(name="to_club_id", type="integer"),
Field(name="from_club_name", type="string", tags=["explore"]),
Field(name="to_club_name", type="string", tags=["explore"]),
Field(
name="transfer_fee",
type="number",
description="The transfer fee in EUR. Null if unknown, 0 if free transfer."
),
Field(
name="market_value_in_eur",
type="number",
descriptWant fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
Asset.__init__ method · python · L27-L41 (15 LOC)transfermarkt_datasets/core/asset.py
def __init__(
    self,
    settings: dict = None) -> None:
    """Set the asset defaults: logger, prep locations, file name and an empty schema.

    Args:
        settings (dict, optional): Stored as-is on ``self.settings``.
    """
    self._prep_df = None
    self.settings = settings
    self.log = logging.getLogger("main")

    prep_location = "data/prep"
    self.prep_location = prep_location
    self.datapackage_descriptor_path = prep_location + "/dataset-metadata.json"

    # derive the file name from the asset name unless one is already set
    if not self.file_name:
        self.file_name = self.name.replace("base_", "") + ".csv"

    self.schema = Schema()

# Asset.prep_df method · python · L51-L69 (19 LOC)transfermarkt_datasets/core/asset.py
def prep_df(self, df):
    """Validate ``df`` against the schema and store it with columns in schema order.

    Raises:
        InvalidPreparedDF: If ``df`` is not exactly a pandas DataFrame, or if
        its columns and the schema field names are not the same set.
    """
    if type(df) != pd.DataFrame:
        raise InvalidPreparedDF(f"Invalid df type: {type(df)}")

    actual = set(list(df.columns.values))
    expected = set(self.schema.field_names)

    # columns present on only one side, in either direction
    mismatched = (actual - expected).union(expected - actual)
    if mismatched:
        raise InvalidPreparedDF(
            f"{self.name}: fields do not match provided schema: {mismatched}"
        )

    self._prep_df = df[self.schema.field_names]

# Asset.load_from_prep method · python · L87-L92 (6 LOC)transfermarkt_datasets/core/asset.py
def load_from_prep(self):
    """Read the CSV at ``self.prep_path`` and assign the dataframe to ``self.prep_df``."""
    source = self.prep_path
    loaded = pd.read_csv(source)
    self.prep_df = loaded

# Asset.schema_as_dataframe method · python · L105-L129 (25 LOC)transfermarkt_datasets/core/asset.py
def schema_as_dataframe(self) -> pd.DataFrame:
    """Render the asset schema as a pandas dataframe.

    Returns:
        pd.DataFrame: One row per schema field, indexed by field name, with
        "description", "type" and "sample_values" columns.
    """
    schema_fields = self.schema.fields

    columns = {
        "description": [field.description for field in schema_fields],
        "type": [field.type for field in schema_fields],
        "sample_values": [
            get_sample_values(self.prep_df, field.name, 3)
            for field in schema_fields
        ],
    }
    field_names = [field.name for field in schema_fields]

    return pd.DataFrame(data=columns, index=field_names)

# Asset.as_frictionless_resource method · python · L131-L142 (12 LOC)transfermarkt_datasets/core/asset.py
def as_frictionless_resource(self) -> Resource:
    """Describe this asset as a frictionless ``Resource`` (with a schema-sync detector)."""
    return Resource(
        detector=Detector(schema_sync=True),
        title=self.frictionless_resource_name,
        path=self.file_name_uncompressed,
        description=self.description,
        schema=self.schema.as_frictionless_schema()
    )

# RawAsset.__init__ method · python · L148-L156 (9 LOC)transfermarkt_datasets/core/asset.py
def __init__(self, settings: dict = None) -> None:
    """Initialise the base asset, then set the raw data location and raw file name.

    Args:
        settings (dict, optional): Passed through to the parent initialiser.
    """
    super().__init__(settings)

    self.raw_df = None
    self.raw_files_path = "data/raw/transfermarkt-scraper"

    # derive the raw file name from the asset name unless one is already set
    if not self.raw_file_name:
        self.raw_file_name = self.name.replace("base_", "") + ".json.gz"

# RawAsset.load_raw method · python · L158-L188 (31 LOC)transfermarkt_datasets/core/asset.py
def load_raw(self):
    """Read the raw JSON-lines data for this asset into ``self.raw_df``.

    When "competitions" is part of ``self.raw_file_name`` a single file,
    data/competitions.json, is read. Otherwise one file is read per season
    listed in the config; each frame gets "season" and "season_file" columns
    and only non-empty frames are kept. The frames are concatenated row-wise.
    """

    raw_dfs = []

    if "competitions" in self.raw_file_name:
        # NOTE(review): `orient` receives a set here (and below); the pandas
        # docs describe `orient` as a string — confirm this is intended.
        df = pd.read_json(
            f"data/competitions.json",
            lines=True,
            convert_dates=True,
            orient={'index', 'date'}
        )
        raw_dfs.append(df)
    else:
        # the "defintions" key presumably matches the spelling used in the
        # config file — verify before renaming it
        seasons = read_config()["defintions"]["seasons"]
        for season in seasons:

            season_file = f"{self.raw_files_path}/{season}/{self.raw_file_name}"

            self.log.debug("Reading raw data from %s", season_file)
            df = pd.read_json(
                season_file,
                lines=True,
                convert_dates=True,
                orient={'index', 'date'}
            )

            # remember which season and file every row came from
            df["season"] = season
            df["season_file"] = season_file
            # empty season files contribute nothing
            if len(df) > 0:
                raw_dfs.append(df)

    # NOTE(review): raw_dfs is empty here when every season file was empty;
    # confirm how pd.concat should behave in that case.
    self.raw_df = pd.concat(raw_dfs, axis=0)

# Dataset.__init__ method · python · L26-L54 (29 LOC)transfermarkt_datasets/core/dataset.py
def __init__(
    self,
    config=None,
    config_file="config.yml",
    assets_root=".",
    assets_relative_path="transfermarkt_datasets/assets",
    ) -> None:
    """Load the configuration, set up logging and instantiate one asset per asset module.

    Args:
        config (dict, optional): Parsed configuration; when falsy it is read from ``config_file``.
        config_file (str, optional): Path of the config file. Defaults to "config.yml".
        assets_root (str, optional): Root folder the assets path is relative to.
        assets_relative_path (str, optional): Folder, relative to the root, searched for ``*.py`` asset modules.
    """
    self.assets_root = assets_root
    self.assets_relative_path = assets_relative_path
    self.config = config or read_config(config_file)
    self.prep_folder_path = "data/prep"
    self.assets = {}

    logging_config = self.config.get("logging")
    if logging_config:
        logging.config.dictConfig(self.config["logging"])
    else:
        logging.basicConfig()
    self.log = logging.getLogger("main")

    assets_dir = pathlib.Path(os.path.join(self.assets_root, self.assets_relative_path))
    for module_path in assets_dir.glob("**/*.py"):
        # the asset definition is looked up by the module name without extension
        module_name = module_path.name.split(".")[0]
        asset = self.get_asset_def(module_name)()
        self.assets[asset.name] = asset
Dataset.load_assets method · python · L69-L74 (6 LOC)transfermarkt_datasets/core/dataset.py
def load_assets(self):
    """Call ``load_from_prep()`` on every public asset of the dataset."""
    public_assets = [asset for asset in self.assets.values() if asset.public]
    for asset in public_assets:
        asset.load_from_prep()

# Dataset.get_relationships method · python · L82-L107 (26 LOC)transfermarkt_datasets/core/dataset.py
def get_relationships(self) -> List[Dict]:
    """Get assets relationships.

    Relationships are derived from the foreign keys declared on each asset schema.

    Returns:
        List[Dict]: A list of relationships (source -> target), each with
        "from", "to" and "on" ({"source", "target"}) entries.
    """
    def describe(source_name, foreign_key):
        target = foreign_key["reference"]
        return {
            "from": source_name,
            "to": target["resource"],
            "on": {
                "source": foreign_key["fields"],
                "target": target["fields"]
            }
        }

    relationships = []
    for asset_name, asset in self.assets.items():
        foreign_keys = asset.schema.foreign_keys
        if not foreign_keys:
            continue
        relationships.extend(describe(asset_name, fk) for fk in foreign_keys)

    return relationships

# Dataset.as_frictionless_package method · python · L109-L138 (30 LOC)transfermarkt_datasets/core/dataset.py
def as_frictionless_package(self, basepath=None, exclude_private=False) -> None:
"""Create an save to local a file descriptor tha defines a "datapackage" for this dataset.
Args:
basepath (str, optional): Base path of prepared files. It defaults to the "prep" folder path.
"""
base_path = basepath or self.prep_folder_path
package = Package(basepath=base_path)
# full spec at https://specs.frictionlessdata.io/data-package/
package.title = "Football Data from Transfermarkt"
package.description = "Clean, structured and automatically updated football (soccer) data from Transfermarkt"
package.keywords = [
"football", "players", "stats", "statistics", "data",
"soccer", "games", "matches"
]
package.id = "davidcariboo/player-scores"
package.licenses = [{
"CC0": "Public Domain"
}]
with open('transfermarkt_datasets/datapackage_description.md') as datapackage_description_file:
package.description = datapackaDataset.write_datapackage method · python · L140-L150 (11 LOC)transfermarkt_datasets/core/dataset.py
def write_datapackage(self):
    """Write the datapackage descriptor to data/prep/dataset-metadata.json with sorted keys.

    Dicts nested directly inside dicts are sorted as well; other values
    (including lists) are written unchanged.
    """
    def sort_dict_by_key(d):
        # recursively sort a json object by key
        ordered = {}
        for key in sorted(d):
            value = d[key]
            ordered[key] = sort_dict_by_key(value) if isinstance(value, dict) else value
        return ordered

    descriptor = json.loads(self.as_frictionless_package().to_json())

    # write the sorted json to a file
    with open("data/prep/dataset-metadata.json", "w") as f:
        json.dump(sort_dict_by_key(descriptor), f, indent=2)

# Field.__init__ method · python · L7-L19 (13 LOC)transfermarkt_datasets/core/schema.py
def __init__(
    self,
    name: str,
    type: str,
    description: str = None,
    tags: List[str] = None,
    form: str = None) -> None:
    """Store the field attributes; ``tags`` falls back to an empty list.

    Args:
        name (str): Field name.
        type (str): Field type.
        description (str, optional): Field description.
        tags (List[str], optional): Tags attached to the field.
        form (str, optional): Passed as ``format`` when converting to a frictionless field.
    """
    self.name, self.type = name, type
    self.description = description
    self.form = form
    # a falsy tags argument (None or empty) becomes a fresh list
    self.tags = tags if tags else []

# Field.as_frictionless_field method · python · L24-L32 (9 LOC)transfermarkt_datasets/core/schema.py
def as_frictionless_field(self) -> frictionless.Field:
    """Convert this field into a ``frictionless.Field``; ``form`` is passed as ``format``."""
    attributes = dict(
        name=self.name,
        type=self.type,
        description=self.description,
        format=self.form
    )
    return frictionless.Field(**attributes)

# Schema.__init__ method · python · L41-L49 (9 LOC)transfermarkt_datasets/core/schema.py
def __init__(
    self,
    fields: List[Field] = None,
    primary_key: List[str] = None,
    foreign_keys: List[str] = None) -> None:
    """Store fields, primary key and foreign keys; each falls back to an empty list.

    Args:
        fields (List[Field], optional): The schema fields.
        primary_key (List[str], optional): Names of the primary key fields.
        foreign_keys (optional): Foreign key definitions.
    """
    # falsy arguments (None or empty) are replaced by fresh lists
    self.fields = fields if fields else []
    self.primary_key = primary_key if primary_key else []
    self.foreign_keys = foreign_keys if foreign_keys else []

# Schema.get_fields_by_tag method · python · L60-L66 (7 LOC)transfermarkt_datasets/core/schema.py
def get_fields_by_tag(self, tag: str) -> List[Field]:
    """Return the schema fields for which ``has_tag(tag)`` is true, in schema order."""
    return list(filter(lambda field: field.has_tag(tag), self.fields))
Schema.as_frictionless_schema method · python · L68-L77 (10 LOC)transfermarkt_datasets/core/schema.py
def as_frictionless_schema(self) -> frictionless.schema.Schema:
    """Build a frictionless schema from the frictionless form of every field."""
    fl_fields = []
    for field in self.fields:
        fl_fields.append(field.as_frictionless_field())

    return frictionless.schema.Schema(fields=fl_fields)

# read_config function · python · L11-L22 (12 LOC)transfermarkt_datasets/core/utils.py
def read_config(config_file="config.yml") -> Dict:
    """Read project configuration from a yaml file.

    Args:
        config_file (str, optional): Path to the config file. Defaults to "config.yml".

    Returns:
        Dict: The parsed config in a python dict
    """
    with open(config_file) as handle:
        # NOTE(review): yaml.Loader is not the safe loader; acceptable only
        # while the config file is trusted — confirm.
        parsed = yaml.load(handle, yaml.Loader)

    return parsed
seasons_list function · python · L24-L52 (29 LOC)transfermarkt_datasets/core/utils.py
def seasons_list(seasons: str) -> List[int]:
    """Generate a list of seasons to acquire based on the "seasons" string. For example,
    for "2012-2014", it should return [2012, 2013, 2014].

    Args:
        seasons (str): A string representing a season ("2012") or an inclusive
            range of seasons ("2012-2014") to acquire.

    Returns:
        List[int]: The expanded list of seasons to acquire.

    Raises:
        ValueError: If the string is empty or a part is not an integer.
        Exception: If the range spans more than 20 seasons or the string has
            more than one "-".
    """
    # str.split never returns an empty list, so emptiness has to be checked on
    # the string itself; ValueError is what int("") raised here before
    if not seasons.strip():
        raise ValueError("Empty string provided for seasons")

    parts = seasons.split("-")

    if len(parts) == 1:  # single season string
        return [int(seasons)]

    if len(parts) == 2:  # range of seasons
        start, end = parts
        season_range = list(range(int(start), int(end) + 1))
        if len(season_range) > 20:
            raise Exception("The range is too high")
        return season_range

    raise Exception(f"Invalid string: {seasons}")