← back to dcaribou__transfermarkt-datasets

Function bodies 51 total

All specs Real LLM only Function bodies
get_player_ids function · python · L41-L60 (20 LOC)
scripts/acquiring/transfermarkt-api.py
def get_player_ids(season: int) -> List[int]:
    """Read the player ids out of a season's players asset.

    The asset is the gzipped JSON-lines file located at
    ``data/raw/transfermarkt-scraper/<season>/players.json.gz``. The id of each
    record is the last ``/``-separated segment of its ``href`` value.

    Args:
        season (int): Season whose players asset is read.

    Returns:
        List[int]: One player id per record, in file order.
    """
    source_path = f"data/raw/transfermarkt-scraper/{season}/players.json.gz"

    # every line of the zipped file is one JSON document
    with gzip.open(source_path, mode="r") as handle:
        records = []
        for raw_line in handle.readlines():
            records.append(json.loads(raw_line))

    ids = []
    for record in records:
        href_segments = record["href"].split("/")
        ids.append(int(href_segments[-1]))

    logging.info(f"Fetched {len(ids)} player ids from {source_path}")
    return ids
fetch_data function · python · L63-L85 (23 LOC)
scripts/acquiring/transfermarkt-api.py
async def fetch_data(session, url, player_id):
    """Send a GET request to the API and pair the decoded body with the player.

    Args:
        session (aiohttp.ClientSession): Session used to send the request.
        url (str): The API URL to request.
        player_id (int): Player the request refers to; echoed back in the result.

    Returns:
        dict: ``{"response": <decoded JSON>, "player_id": player_id}``. The
        ``response`` value is ``None`` when decoding the body raises
        ``aiohttp.ContentTypeError`` (the failure is logged).
    """
    request_headers = {'Content-Type': 'application/json', 'User-Agent': USER_AGENT}

    # stays None unless the body decodes successfully
    payload = None

    async with session.get(url=url, headers=request_headers, ssl=False) as response:
        try:
            payload = await response.json()
        except aiohttp.ContentTypeError as e:
            logging.error(f"Failed to fetch data for player {player_id}: {e}")

    return {"response": payload, "player_id": player_id}
get_market_values function · python · L88-L106 (19 LOC)
scripts/acquiring/transfermarkt-api.py
async def get_market_values(player_ids: List[int]) -> List[dict]:
    """Request the market values endpoint once per player, concurrently.

    Args:
        player_ids (List[int]): Players to request.

    Returns:
        List[dict]: One ``fetch_data`` result per player id, in the order the
        ids were given.
    """
    logging.info(f"Requesting market values for {len(player_ids)} players")

    async with aiohttp.ClientSession() as session:
        # one coroutine per player, all sharing the same session
        pending = []
        for player_id in player_ids:
            endpoint = MARKET_VALUES_API + str(player_id)
            pending.append(fetch_data(session, endpoint, player_id))

        # run them together; gather returns results in submission order
        results = await asyncio.gather(*pending)

    return results
get_transfers function · python · L109-L127 (19 LOC)
scripts/acquiring/transfermarkt-api.py
async def get_transfers(player_ids: List[int]) -> List[dict]:
    """Request the transfer history endpoint once per player, concurrently.

    Args:
        player_ids (List[int]): Players to request.

    Returns:
        List[dict]: One ``fetch_data`` result per player id, in the order the
        ids were given.
    """
    logging.info(f"Requesting transfer history for {len(player_ids)} players")

    async with aiohttp.ClientSession() as session:
        # one coroutine per player, all sharing the same session
        pending = []
        for player_id in player_ids:
            endpoint = TRANSFERS_API + str(player_id)
            pending.append(fetch_data(session, endpoint, player_id))

        # run them together; gather returns results in submission order
        results = await asyncio.gather(*pending)

    return results
persist_data function · python · L129-L137 (9 LOC)
scripts/acquiring/transfermarkt-api.py
def persist_data(data: List[dict], path: str) -> None:
    """Write the items to a file as JSON lines.

    Each item is serialized with ``json.dumps`` and written on its own line.
    An existing file at ``path`` is overwritten.

    Args:
        data (List[dict]): Items to persist.
        path (str): Path of the file to write.
    """
    with open(path, "w") as out:
        for record in data:
            out.write(json.dumps(record) + "\n")
run_for_season function · python · L139-L168 (30 LOC)
scripts/acquiring/transfermarkt-api.py
def run_for_season(season: int) -> None:
    """Run all steps for a given season.

    Args:
        season (int): The season to process
    """
    target_market_values_path = f"data/raw/transfermarkt-api/{season}/market_values.json"
    target_transfers_path = f"data/raw/transfermarkt-api/{season}/transfers.json"

    logging.info(f"Starting player data acquisition for season {season}")

    # create target directories if they do not exist
    pathlib.Path(target_market_values_path).parent.mkdir(parents=True, exist_ok=True)
    pathlib.Path(target_transfers_path).parent.mkdir(parents=True, exist_ok=True)

    # get player IDs for the season
    player_ids = get_player_ids(season)

    # collect market values and transfers for players in SEASON
    market_values = asyncio.run(get_market_values(player_ids))
    transfers = asyncio.run(get_transfers(player_ids))

    # filter out player ids in responses that are not in the original list
    transfers = [item for item in transfers if ite
Asset.all method · python · L66-L74 (9 LOC)
scripts/acquiring/transfermarkt-scraper.py
  def all(cls):
    """Build the list of assets to be acquired, in acquisition order.

    Asset acquisition steps depend on each other, so the order of the returned
    list is the order the steps have to run in. One asset is created per name
    in ``cls.asset_parents`` (skipping 'competitions'), and then each of them
    gets its parent set.
    """
    ordered_assets = []
    for name in cls.asset_parents:
      if name == 'competitions':
        continue
      ordered_assets.append(Asset(name))

    # parents are resolved only once every asset has been created
    for asset in ordered_assets:
      asset.set_parent()

    return ordered_assets
If a scraper extracted this row, it came from Repobility (https://repobility.com)
run_tfmkt function · python · L76-L102 (27 LOC)
scripts/acquiring/transfermarkt-scraper.py
def run_tfmkt(crawler, season=None, parents_file=None):
  """Invoke the tfmkt CLI for a crawler and hand back what it printed.

  Args:
      crawler (str): The crawler to run (e.g. 'clubs', 'players', 'confederations').
      season (int, optional): The season year, passed as ``-s``.
      parents_file (str, optional): Path to the parents JSONL file, passed as ``-p``.

  Returns:
      str: The stdout output (JSONL).

  Raises:
      subprocess.CalledProcessError: If the command exits with a non-zero
        return code. The captured stdout and stderr are attached.
  """
  optional_args = []
  if season is not None:
    optional_args += ["-s", str(season)]
  if parents_file is not None:
    optional_args += ["-p", str(parents_file)]
  cmd = ["tfmkt", crawler] + optional_args

  logging.info(f"Running: {' '.join(cmd)}")
  result = subprocess.run(cmd, capture_output=True, text=True)

  if result.returncode == 0:
    return result.stdout

  # failure path: log what is known, then surface the error to the caller
  logging.error(f"tfmkt failed with return code {result.returncode}")
  if result.stderr:
    logging.error(f"stderr: {result.stderr}")
  raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)
acquire_asset function · python · L104-L122 (19 LOC)
scripts/acquiring/transfermarkt-scraper.py
def acquire_asset(asset, season):
  """Run the tfmkt crawler for one asset and season and store its output gzipped.

  The season folder under ``data/raw/transfermarkt-scraper`` is created when
  missing. When the asset has a parent, the parent's file for the season is
  passed to the crawler. A previously written output file is replaced.
  """
  pathlib.Path(f"data/raw/transfermarkt-scraper/{season}").mkdir(parents=True, exist_ok=True)

  parent_file = None
  if asset.parent:
    parent_file = asset.parent.file_full_path(season)
  target = asset.file_path(season)

  logging.info(f"Acquiring {asset.name} for season {season}")
  crawled = run_tfmkt(asset.name, season=season, parents_file=parent_file)

  # drop the previous version of the file, if any, before writing the new one
  if target.exists():
    os.remove(str(target))

  # the crawler output is stored gzip-compressed, written in text mode
  with gzip.open(str(target), 'wt') as compressed:
    compressed.write(crawled)
  logging.info(f"Wrote {target}")
acquire_on_local function · python · L124-L141 (18 LOC)
scripts/acquiring/transfermarkt-scraper.py
def acquire_on_local(asset, seasons):
  """Acquire the requested asset(s) for every season in ``seasons``.

  Args:
      asset (str): An asset name, or 'all' to acquire every asset returned by
        ``Asset.all()``.
      seasons (str): Seasons string, expanded with ``seasons_list``.
  """
  expanded_seasons = seasons_list(seasons)

  if asset == 'all':
    targets = Asset.all()
  else:
    # a single asset still needs its parent resolved before it is acquired
    single = Asset(name=asset)
    single.set_parent()
    targets = [single]

  for season in expanded_seasons:
    for target in targets:
      acquire_asset(target, season)
publish_to_dataworld function · python · L15-L83 (69 LOC)
scripts/synching/sync-dataworld.py
def publish_to_dataworld(folder):
  """Push the contents of the folder to data.world's dataset dcereijo/player-scores
  :param folder: dataset folder path
  """

  with open(folder + '/dataset-metadata.json') as metadata_file:
    metadata = json.load(metadata_file)

  dw_files = []
  for resource in metadata['resources']:
    filename = resource['path']
    if not filename.endswith('.gz'):
      filename += '.gz'

    url = f'{R2_PUBLIC_URL}/{filename}'

    dw_files.append(
      {
        'name': resource['title'],
        'description': (resource['description'])[0:120],
        'labels': ['clean data'],
        'source': {
          'url': url
        }
      }
    )

  metadata['summary'] = metadata['description']
  metadata['description'] = "Clean, structured and automatically updated football (soccer) data from Transfermarkt"
  metadata['tags'] = metadata['keywords']
  metadata['license'] = metadata['licenses'][0]['CC0']

  metadata['visibility'] = 'OPEN'

  del metadata['keywor
publish_to_kaggle function · python · L11-L24 (14 LOC)
scripts/synching/sync-kaggle.py
def publish_to_kaggle(folder, message):
  """Create a new version of the Kaggle dataset from the contents of a folder.

  :param folder: dataset folder path
  :param message: a string message with version notes
  """
  client = KaggleApi()
  client.authenticate()

  # https://github.com/Kaggle/kaggle-api/blob/master/kaggle/api/kaggle_api_extended.py#L1317
  client.dataset_create_version(folder=folder, version_notes=message)
sync_to_r2 function · python · L22-L41 (20 LOC)
scripts/synching/sync-r2.py
def sync_to_r2(prep_dir):
    """Upload the prepared files found in ``prep_dir`` to the R2 bucket.

    Every ``*.csv.gz`` file is uploaded first, in sorted path order, followed
    by ``dataset-metadata.json`` and ``transfermarkt-datasets.zip`` when they
    exist. Each object key is ``R2_PREFIX`` plus the file's base name, and one
    line is printed per upload.

    Args:
        prep_dir (str): Directory holding the prepared files.
    """
    s3 = boto3.client(
        "s3",
        endpoint_url=R2_ENDPOINT,
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )

    def upload(local_path, object_name):
        # announce the transfer, then push the file to the bucket
        key = f"{R2_PREFIX}/{object_name}"
        print(f"  {object_name} -> s3://{R2_BUCKET}/{key}")
        s3.upload_file(local_path, R2_BUCKET, key)

    csv_pattern = os.path.join(prep_dir, "*.csv.gz")
    for csv_path in sorted(glob.glob(csv_pattern)):
        upload(csv_path, os.path.basename(csv_path))

    # optional companions of the CSV files; skipped when absent
    for extra in ["dataset-metadata.json", "transfermarkt-datasets.zip"]:
        extra_path = os.path.join(prep_dir, extra)
        if os.path.exists(extra_path):
            upload(extra_path, extra)
top_n_players function · python · L10-L15 (6 LOC)
streamlit/pages/03_💰_app:_player_value.py
def top_n_players(df: pd.DataFrame, n: int) -> List[str]:
    """Return the names of the ``n`` players with the highest market value.

    Rows are sorted by ``market_value_in_eur`` in descending order, and the
    ``name`` values of the first ``n`` rows are returned as produced by
    ``.values``.
    """
    ranked = df.sort_values(by="market_value_in_eur", ascending=False)
    top_names = ranked.head(n)["name"]
    return top_names.values
load_td function · python · L20-L33 (14 LOC)
streamlit/utils.py
def load_td() -> Dataset:
    """Instantiate and initialise a Dataset, so it can be used in the app.

    When the ``STREAMLIT`` environment variable equals ``"cloud"``, the
    prepared data is pulled first with ``dvc pull data/prep``.

    Returns:
        Dataset: A transfermarkt_datasets.core.Dataset that is initialised and ready to be used.
    """

    # .get() so that an unset STREAMLIT variable means "not cloud" instead of
    # raising KeyError before the dataset can be loaded
    if os.environ.get("STREAMLIT") == "cloud":
        os.system("dvc pull data/prep")

    td = Dataset()
    td.load_assets()

    return td
Repobility — the code-quality scanner for AI-generated software · https://repobility.com
read_file_contents function · python · L35-L44 (10 LOC)
streamlit/utils.py
def read_file_contents(file_path: str):
    """Return the text content of a file on disk.

    Args:
        file_path (str): The path of the file to be read.

    Returns:
        str: The contents of the file as a string.
    """
    target = Path(file_path)
    contents = target.read_text()
    return contents
draw_dataset_index function · python · L46-L61 (16 LOC)
streamlit/utils.py
def draw_dataset_index(td: Dataset) -> None:
    """Render a markdown index of the dataset's public assets in the sidebar.

    Each public asset becomes one bullet whose label and anchor are both
    derived from the asset's ``frictionless_resource_name``.

    Args:
        td (Dataset): The dataset whose assets are listed.
    """
    bullets = []

    for asset in td.assets.values():
        # private assets are left out of the index
        if not asset.public:
            continue

        label = titleize(asset.frictionless_resource_name).lower()
        anchor = dasherize(asset.frictionless_resource_name).lower()
        bullets.append(f"* [{label}](#{anchor})")

    index_markdown = "\n".join(bullets)
    st.sidebar.markdown(index_markdown)
draw_asset function · python · L63-L90 (28 LOC)
streamlit/utils.py
def draw_asset(asset: Asset) -> None:
    """Draw a transfermarkt-datasets asset summary.

    Shows the asset title and description next to a record-count metric,
    followed by an "Attributes" expander, an "Explore" expander and a
    horizontal rule.

    Args:
        asset (Asset): The asset to draw.
    """
    summary_col, metric_col = st.columns([5,1])

    summary_col.subheader(titleize(asset.frictionless_resource_name).lower())
    summary_col.markdown(asset.description)

    # the delta is worked out first and shown under the total record count
    new_records = get_records_delta(asset)
    metric_col.metric(
        label="# of records",
        value=len(asset.prep_df),
        delta=new_records,
        help="Total number of records in the asset / New records in the past week"
    )

    with st.expander("Attributes"):
        draw_asset_schema(asset)

    with st.expander("Explore"):
        draw_asset_explore(asset)

    st.markdown("---")
draw_asset_explore function · python · L92-L146 (55 LOC)
streamlit/utils.py
def draw_asset_explore(asset: Asset) -> None:
    """Draw dataframe together with dynamic filters for exploration.

    Args:
        asset (Asset): The asset to draw the explore for.
    """
    
    tagged_columns = [
        field.name
        for field in asset.schema.get_fields_by_tag("explore")
    ]
    default_columns = list(asset.prep_df.columns[:4].values)

    if len(tagged_columns) > 0:
        columns = tagged_columns
    else:
        columns = default_columns

    filter_columns = st.multiselect(
        label="Search by",
        options=asset.prep_df.columns,
        default=columns
    )
    if len(filter_columns) == 0:
        filter_columns = columns

    st_cols = st.columns(len(filter_columns))

    df = asset.prep_df.copy()

    for st_col, at_col in zip(st_cols, filter_columns):

        options = list(df[at_col].unique())
 
        selected = st_col.selectbox(
            label=at_col,
            options=options,
            key=(asset.name + "-" + at_col)
   
render_svg function · python · L156-L186 (31 LOC)
streamlit/utils.py
def render_svg(svg, caption):
    """Render an SVG string as a bordered figure with a caption.

    The SVG is base64-encoded into a data URI used as the source of an
    ``<img>`` tag, which is wrapped in a ``<figure>`` together with the caption
    and an inline stylesheet. The resulting HTML is written to the page with
    ``unsafe_allow_html=True``.
    """
    encoded_svg = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
    image_tag = f'<img src="data:image/svg+xml;base64,{encoded_svg}"/>'
    caption_tag = f"<figcaption>{caption}</figcaption>"

    # styling shared by the figure frame and its caption bar
    stylesheet = """
    <style>
        figure {
            border: 1px #cccccc solid;
            padding: 4px;
            margin: auto;
        }
        figcaption {
            background-color: black;
            color: white;
            font-style: italic;
            padding: 2px;
            text-align: center;
        }
    </style>
    """

    figure_html = f"""
    {stylesheet}
    <figure>
    {image_tag}
    {caption_tag}
    </figure>
    
    &nbsp;
    """
    st.write(figure_html, unsafe_allow_html=True)
get_records_delta function · python · L194-L211 (18 LOC)
streamlit/utils.py
def get_records_delta(asset: Asset, offset: int = 7) -> int:
    """Count the asset records dated within the last ``offset`` days.

    Args:
        asset (Asset): The asset to calculate the delta for.
        offset (int, optional): Size of the window, in days. Defaults to 7.

    Returns:
        int: Number of records whose ``date`` is later than now minus
        ``offset`` days, or None when the asset has no ``date`` column.
    """
    records = asset.prep_df

    # assets without a date column have no delta to report
    if "date" not in records.columns:
        return None

    record_dates = pd.to_datetime(records["date"])
    cutoff = datetime.now() - timedelta(days=offset)
    recent_records = records[record_dates > cutoff]
    return len(recent_records)
CurAppearancesAsset.__init__ method · python · L17-L50 (34 LOC)
transfermarkt_datasets/assets/cur_appearances.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema(
      fields=[
        Field(name="appearance_id", type="integer"),
        Field(name="game_id", type="integer"),
        Field(name="player_id", type="integer"),
        Field(
          name="player_club_id",
          type="integer",
          description="ID of the club that the player belonged to at the time of the game."
        ),
        Field(
          name="player_current_club_id",
          type="integer",
          description="ID of the club that the player currently belongs to."
        ),
        Field(name="date", type="date", tags=["explore"]),
        Field(name="player_name", type="string", tags=["explore"]),
        Field(name="competition_id", type="string"),
        Field(name="yellow_cards", type="integer"),
        Field(name="red_cards", type="integer"),
        Field(name="goals", type="integer"),
        Field(name="assists", type="integer"),
    
CurClubGamesAsset.__init__ method · python · L35-L65 (31 LOC)
transfermarkt_datasets/assets/cur_club_games.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema()

    self.schema.add_field(Field(name='club_id', type='integer'))
    self.schema.add_field(Field(name='game_id', type='integer'))
    self.schema.add_field(Field(name='own_goals', type='integer'))
    self.schema.add_field(Field(name='own_position', type='integer'))
    self.schema.add_field(Field(name='own_manager_name', type="string", tags=["explore"]))
    self.schema.add_field(Field(name='opponent_id', type='integer'))
    self.schema.add_field(Field(name='opponent_goals', type='integer'))
    self.schema.add_field(Field(name='opponent_position', type='integer'))
    self.schema.add_field(Field(name='opponent_manager_name', type='string'))
    self.schema.add_field(Field(
      name="hosting",
      type="string",
      description="'Home' if the game took place at the club home stadium and 'Away' if at its opponent stadium"
    ))
    self.schema.add_field(Field(
     
Powered by Repobility — scan your code at https://repobility.com
CurClubsAsset.__init__ method · python · L19-L60 (42 LOC)
transfermarkt_datasets/assets/cur_clubs.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema()

    self.schema.add_field(Field(name='club_id', type='integer'))
    self.schema.add_field(Field(name='club_code', type='string'))
    self.schema.add_field(Field(name='name', type='string'))
    self.schema.add_field(Field(name='domestic_competition_id', type='string', tags=["explore"]))
    self.schema.add_field(Field(
        name='total_market_value',
        type='number',
        description="Aggregated players' Transfermarkt market value in millions of pounds"
      )
    )
    self.schema.add_field(Field(name='squad_size', type='integer'))
    self.schema.add_field(Field(name='average_age', type='number'))
    self.schema.add_field(Field(name='foreigners_number', type='integer'))
    self.schema.add_field(Field(name='foreigners_percentage', type='number'))
    self.schema.add_field(Field(name='national_team_players', type='integer'))
    self.schema.add_field(Field(
CurCompetitionsAsset.__init__ method · python · L16-L43 (28 LOC)
transfermarkt_datasets/assets/cur_competitions.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema()

    self.schema.add_field(Field(name="competition_id", type="string"))
    self.schema.add_field(Field(name="competition_code", type="string"))
    self.schema.add_field(Field(name="name", type="string"))
    self.schema.add_field(Field(name="type", type="string"))
    self.schema.add_field(Field(name="sub_type", type="string"))
    self.schema.add_field(Field(
      name="is_major_national_league",
      type="boolean",
      description="Competition is a major national league in the confederation."
      )
    )
    self.schema.add_field(Field(name="country_id", type="integer"))
    self.schema.add_field(Field(name="country_name", type="string"))
    self.schema.add_field(Field(name="domestic_league_code", type="string"))
    self.schema.add_field(Field(name="confederation", type="string", tags=["explore"]))
    self.schema.add_field(Field(
        name="url",
        typ
CurGameEventsAsset.__init__ method · python · L15-L55 (41 LOC)
transfermarkt_datasets/assets/cur_game_events.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema(
      fields=[
        Field(
          name='game_event_id',
          type='string',
          description="Surrogate key"
        ),
        Field(name='date', type='date'),
        Field(name='game_id', type='integer'),
        Field(name='player_id', type='integer'),
        Field(name='club_id', type='integer'),
        Field(name='club_name', type='string'),
        Field(name='type', type='string'),
        Field(name='minute', type='integer'),
        Field(name='description', type='string'),
        Field(
          name='player_in_id',
          type='string',
          description="For subsitution events, ID of the player who joins the game. Null otherwise"
        ),
        Field(
          name='player_assist_id',
          type='string',
          description="For goal events, ID of the player who did the assist. Null otherwise"
        ),
      ]
    )

    s
CurGameLineupsAsset.__init__ method · python · L13-L39 (27 LOC)
transfermarkt_datasets/assets/cur_game_lineups.py
  def __init__(self, *args, **kwargs) -> None:
    """Define the game lineups schema: its fields and its primary key.

    All positional and keyword arguments are forwarded to the parent class.
    """
    super().__init__(*args, **kwargs)

    surrogate_key = Field(
      name='game_lineups_id',
      type='string',
      description="Surrogate key"
    )
    # the remaining fields only need a name and a type
    plain_fields = [
      Field(name=field_name, type=field_type)
      for field_name, field_type in (
        ('game_id', 'integer'),
        ('player_id', 'integer'),
        ('club_id', 'integer'),
        ('type', 'string'),
        ('player_name', 'string'),
        ('team_captain', 'string'),
        ('number', 'string'),
        ('position', 'string'),
        ('date', 'date'),
      )
    ]

    self.schema = Schema(fields=[surrogate_key] + plain_fields)
    self.schema.primary_key = ['game_id', 'player_id', 'club_id']
CurGamesAsset.__init__ method · python · L14-L49 (36 LOC)
transfermarkt_datasets/assets/cur_games.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema(
      fields=[
        Field(name='game_id', type='integer'),
        Field(name='competition_id', type='string', tags=["explore"]),
        Field(name='competition_type', type='string'),
        Field(name='season', type='integer', tags=["explore"]),
        Field(name='round', type='string', tags=["explore"]),
        Field(name='date', type='date', tags=["explore"]),
        Field(name='home_club_id', type='integer'),
        Field(name='away_club_id', type='integer'),
        Field(name='home_club_goals', type='integer'),
        Field(name='away_club_goals', type='integer'),
        Field(name='aggregate', type='string'),
        Field(name='home_club_position', type='integer'),
        Field(name='away_club_position', type='integer'),
        Field(name='home_club_name', type='string', tags=["explore"]),
        Field(name='away_club_name', type='string', tags=["explore
CurPlayersAsset.__init__ method · python · L21-L72 (52 LOC)
transfermarkt_datasets/assets/cur_players.py
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)

    self.schema = Schema(
      fields=[
        Field(name="player_id", type="integer"),
        Field(name="name", type="string"),
        Field(name="current_club_id", type="integer"),
        Field(name="current_club_name", type="string", tags=["explore"]),
        Field(name="country_of_citizenship", type="string"),
        Field(name="country_of_birth", type="string"),
        Field(name="city_of_birth", type="string"),
        Field(name="date_of_birth", type="date"),
        Field(name="position", type="string"),
        Field(name="sub_position", type="string"),
        Field(name="foot", type="string"),
        Field(name="height_in_cm", type="integer"),
        Field(
          name="market_value_in_eur",
          type="number",
          description="The player's current market value in EUR."
        ),
        Field(
          name="highest_market_value_in_eur",
          type="number",
CurPlayerValuationsAsset.__init__ method · python · L17-L38 (22 LOC)
transfermarkt_datasets/assets/cur_player_valuations.py
  def __init__(self, *args, **kwargs) -> None:
    """Define the player valuations schema: fields, primary key and foreign keys.

    All positional and keyword arguments are forwarded to the parent class.
    """
    super().__init__(*args, **kwargs)

    valuation_fields = [
      Field(name='date', type='date'),
      Field(name='player_id', type='integer'),
      Field(name='current_club_name', type='string'),
      Field(name='current_club_id', type='integer'),
      Field(name='market_value_in_eur', type='number'),
      Field(name='player_club_domestic_competition_id', type='string', tags=["explore"]),
    ]
    # each valuation points at its player in the cur_players resource
    players_reference = {"resource": "cur_players", "fields": "player_id"}

    self.schema = Schema(fields=valuation_fields)
    self.schema.primary_key = ['player_id', 'date']
    self.schema.foreign_keys = [
      {"fields": "player_id", "reference": players_reference}
    ]
CurTransfersAsset.__init__ method · python · L15-L44 (30 LOC)
transfermarkt_datasets/assets/cur_transfers.py
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.schema = Schema(
            fields=[
                Field(name="player_id", type="integer"),
                Field(name="player_name", type="string"),
                Field(name="transfer_date", type="date"),
                Field(name="transfer_season", type="string"),
                Field(name="from_club_id", type="integer"),
                Field(name="to_club_id", type="integer"),
                Field(name="from_club_name", type="string", tags=["explore"]),
                Field(name="to_club_name", type="string", tags=["explore"]),
                Field(
                    name="transfer_fee",
                    type="number",
                    description="The transfer fee in EUR. Null if unknown, 0 if free transfer."
                ),
                Field(
                    name="market_value_in_eur",
                    type="number",
                    descript
Want fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
Asset.__init__ method · python · L27-L41 (15 LOC)
transfermarkt_datasets/core/asset.py
  def __init__(self, settings: dict = None) -> None:
    """Initialise the state shared by every asset.

    Args:
      settings (dict, optional): Stored as-is on ``self.settings``.
    """
    self._prep_df = None
    self.settings = settings
    self.log = logging.getLogger("main")

    self.prep_location = "data/prep"
    self.datapackage_descriptor_path = f"{self.prep_location}/dataset-metadata.json"

    # derive the file name from the asset name unless one is already set:
    # the "base_" marker is removed and ".csv" is appended
    if not self.file_name:
      self.file_name = self.name.replace("base_", "") + ".csv"

    self.schema = Schema()
Asset.prep_df method · python · L51-L69 (19 LOC)
transfermarkt_datasets/core/asset.py
  def prep_df(self, df):
    """Validate ``df`` against the asset schema and store it as the prepared dataframe.

    The dataframe columns must be exactly the schema field names, in any
    order. The stored dataframe has its columns ordered as in the schema.

    Args:
      df (pd.DataFrame): The prepared dataframe.

    Raises:
      InvalidPreparedDF: If ``df`` is not a pandas DataFrame, or if its set of
        columns differs from the set of schema field names.
    """
    # isinstance rather than an exact type comparison, so that DataFrame
    # subclasses are accepted as well
    if not isinstance(df, pd.DataFrame):
      raise InvalidPreparedDF(f"Invalid df type: {type(df)}")

    field_names = self.schema.field_names

    # columns present on one side only, whichever side that is
    set_difference = set(df.columns.values).symmetric_difference(field_names)
    if set_difference:
      raise InvalidPreparedDF(
        f"{self.name}: fields do not match provided schema: {set_difference}"
      )

    self._prep_df = df[field_names]
Asset.load_from_prep method · python · L87-L92 (6 LOC)
transfermarkt_datasets/core/asset.py
  def load_from_prep(self):
    """Read the prepared CSV at ``self.prep_path`` and assign it to ``self.prep_df``.
    """
    prepared = pd.read_csv(self.prep_path)
    self.prep_df = prepared
Asset.schema_as_dataframe method · python · L105-L129 (25 LOC)
transfermarkt_datasets/core/asset.py
  def schema_as_dataframe(self) -> pd.DataFrame:
    """Render the asset schema as a pandas dataframe.

    Returns:
        pd.DataFrame: One row per schema field, indexed by field name, with
        ``description``, ``type`` and ``sample_values`` columns. The sample
        values come from ``get_sample_values`` applied to the prepared
        dataframe.
    """
    names, descriptions, types, samples = [], [], [], []

    for field in self.schema.fields:
      names.append(field.name)
      descriptions.append(field.description)
      types.append(field.type)
      samples.append(get_sample_values(self.prep_df, field.name, 3))

    columns = {
      "description": descriptions,
      "type": types,
      "sample_values": samples,
    }
    return pd.DataFrame(data=columns, index=names)
Asset.as_frictionless_resource method · python · L131-L142 (12 LOC)
transfermarkt_datasets/core/asset.py
  def as_frictionless_resource(self) -> Resource:
    """Describe this asset as a frictionless ``Resource``.

    The resource takes its title, path, description and schema from the asset
    and uses a ``Detector`` created with ``schema_sync=True``.
    """
    sync_detector = Detector(schema_sync=True)

    return Resource(
      title=self.frictionless_resource_name,
      path=self.file_name_uncompressed,
      detector=sync_detector,
      description=self.description,
      schema=self.schema.as_frictionless_schema()
    )
RawAsset.__init__ method · python · L148-L156 (9 LOC)
transfermarkt_datasets/core/asset.py
  def __init__(self, settings: dict = None) -> None:
    """Initialise a raw asset on top of the base asset state.

    Args:
      settings (dict, optional): Forwarded to the parent initialiser.
    """
    super().__init__(settings)

    self.raw_files_path = "data/raw/transfermarkt-scraper"
    self.raw_df = None

    # derive the raw file name from the asset name unless one is already set
    if not self.raw_file_name:
      self.raw_file_name = self.name.replace("base_", "") + ".json.gz"
RawAsset.load_raw method · python · L158-L188 (31 LOC)
transfermarkt_datasets/core/asset.py
  def load_raw(self):
    """Load the raw JSON-lines files of this asset into ``self.raw_df``.

    When the raw file name contains "competitions", the single
    ``data/competitions.json`` file is read. Otherwise one file is read per
    configured season; each season dataframe gets ``season`` and
    ``season_file`` columns, and empty ones are left out. The collected
    dataframes are concatenated row-wise.
    """

    def read_json_lines(path):
      # NOTE(review): ``orient`` is given a set here, which is unusual for
      # pandas.read_json — kept exactly as it was; confirm it is intended
      return pd.read_json(
        path,
        lines=True,
        convert_dates=True,
        orient={'index', 'date'}
      )

    frames = []

    if "competitions" in self.raw_file_name:
      frames.append(read_json_lines("data/competitions.json"))
    else:
      # the "defintions" spelling is the key that is looked up in the config
      for season in read_config()["defintions"]["seasons"]:
        season_file = f"{self.raw_files_path}/{season}/{self.raw_file_name}"

        self.log.debug("Reading raw data from %s", season_file)
        season_df = read_json_lines(season_file)
        season_df["season"] = season
        season_df["season_file"] = season_file
        if len(season_df) > 0:
          frames.append(season_df)

    self.raw_df = pd.concat(frames, axis=0)
Dataset.__init__ method · python · L26-L54 (29 LOC)
transfermarkt_datasets/core/dataset.py
  def __init__(
    self,
    config=None,
    config_file="config.yml",
    assets_root=".",
    assets_relative_path="transfermarkt_datasets/assets",
  ) -> None:
    """Set up the dataset: configuration, logging and asset instances.

    Args:
      config (dict, optional): Parsed configuration. When falsy, it is read
        from ``config_file`` instead.
      config_file (str, optional): Path of the config file. Defaults to "config.yml".
      assets_root (str, optional): Root folder the assets path is relative to.
      assets_relative_path (str, optional): Folder, relative to ``assets_root``,
        that is searched recursively for ``*.py`` asset definition files.
    """
    self.assets_root = assets_root
    self.assets_relative_path = assets_relative_path

    self.config = config or read_config(config_file)

    self.prep_folder_path = "data/prep"
    self.assets = {}

    # use the logging section of the config when present, defaults otherwise
    logging_config = self.config.get("logging")
    if logging_config:
      logging.config.dictConfig(logging_config)
    else:
      logging.basicConfig()

    self.log = logging.getLogger("main")

    # one asset is instantiated per python file found under the assets folder
    assets_dir = pathlib.Path(os.path.join(self.assets_root, self.assets_relative_path))
    for module_path in assets_dir.glob("**/*.py"):
      module_name = module_path.name.split(".")[0]
      asset = self.get_asset_def(module_name)()
      self.assets[asset.name] = asset
If a scraper extracted this row, it came from Repobility (https://repobility.com)
Dataset.load_assets method · python · L69-L74 (6 LOC)
transfermarkt_datasets/core/dataset.py
  def load_assets(self):
    """Load every public asset of the dataset from its prepared file.
    """
    for asset in self.assets.values():
      # private assets are not loaded
      if not asset.public:
        continue
      asset.load_from_prep()
Dataset.get_relationships method · python · L82-L107 (26 LOC)
transfermarkt_datasets/core/dataset.py
  def get_relationships(self) -> List[Dict]:
    """Collect the relationships between assets.

    A relationship is produced for every foreign key declared in an asset's
    schema.

    Returns:
        List[Dict]: A list of relationships (source -> target), each holding
        the source asset name (``from``), the referenced resource (``to``) and
        the fields on both sides (``on``).
    """
    return [
      {
        "from": asset_name,
        "to": foreign_key["reference"]["resource"],
        "on": {
          "source": foreign_key["fields"],
          "target": foreign_key["reference"]["fields"]
        }
      }
      for asset_name, asset in self.assets.items()
      if asset.schema.foreign_keys
      for foreign_key in asset.schema.foreign_keys
    ]
Dataset.as_frictionless_package method · python · L109-L138 (30 LOC)
transfermarkt_datasets/core/dataset.py
  def as_frictionless_package(self, basepath=None, exclude_private=False) -> None:
    """Create an save to local a file descriptor tha defines a "datapackage" for this dataset.

    Args:
        basepath (str, optional): Base path of prepared files. It defaults to the "prep" folder path.
    """
    base_path = basepath or self.prep_folder_path
    package = Package(basepath=base_path)

    # full spec at https://specs.frictionlessdata.io/data-package/
    package.title = "Football Data from Transfermarkt"
    package.description = "Clean, structured and automatically updated football (soccer) data from Transfermarkt"
    package.keywords = [
      "football", "players", "stats", "statistics", "data",
      "soccer", "games", "matches"
    ]
    package.id = "davidcariboo/player-scores"
    package.licenses = [{
      "CC0": "Public Domain"
    }]

    with open('transfermarkt_datasets/datapackage_description.md') as datapackage_description_file:
      package.description = datapacka
Dataset.write_datapackage method · python · L140-L150 (11 LOC)
transfermarkt_datasets/core/dataset.py
  def write_datapackage(self):
    """Write the dataset's datapackage descriptor to data/prep/dataset-metadata.json.

    Keys are sorted in the top-level object and in every dict nested directly
    inside another dict. Values of any other type, lists included, are written
    as they are.
    """
    descriptor = json.loads(self.as_frictionless_package().to_json())

    def sorted_by_key(mapping):
      # rebuild the dict in key order, descending into nested dicts
      ordered = {}
      for key in sorted(mapping):
        value = mapping[key]
        ordered[key] = sorted_by_key(value) if isinstance(value, dict) else value
      return ordered

    with open("data/prep/dataset-metadata.json", "w") as f:
      json.dump(sorted_by_key(descriptor), f, indent=2)
Field.__init__ method · python · L7-L19 (13 LOC)
transfermarkt_datasets/core/schema.py
    def __init__(
        self,
        name: str,
        type: str,
        description: str = None,
        tags: List[str] = None,
        form: str = None,
    ) -> None:
        """Describe one field (column) of an asset schema.

        Args:
            name (str): Field name.
            type (str): Field type.
            description (str, optional): Description of the field.
            tags (List[str], optional): Tags attached to the field. An empty
                list is used when none are given.
            form (str, optional): Stored on ``self.form``.
        """
        self.name, self.type = name, type
        self.description = description
        self.form = form

        # a fresh list per instance when no tags are passed
        self.tags = tags if tags else []
Field.as_frictionless_field method · python · L24-L32 (9 LOC)
transfermarkt_datasets/core/schema.py
    def as_frictionless_field(self) -> frictionless.Field:
        """Convert this field into a ``frictionless.Field``.

        The name, type and description are passed through, and ``form`` is
        passed as the frictionless ``format``. Tags are not passed.
        """
        return frictionless.Field(
            name=self.name,
            type=self.type,
            description=self.description,
            format=self.form,
        )
Schema.__init__ method · python · L41-L49 (9 LOC)
transfermarkt_datasets/core/schema.py
    def __init__(
        self,
        fields: List[Field] = None,
        primary_key: List[str] = None,
        foreign_keys: List[str] = None,
    ) -> None:
        """Describe the schema of an asset.

        Every argument is optional; an empty list is stored for each one that
        is missing or empty.

        Args:
            fields (List[Field], optional): The schema fields.
            primary_key (List[str], optional): Names of the primary key fields.
            foreign_keys (List[str], optional): Foreign key definitions.
        """
        self.fields = fields if fields else []
        self.primary_key = primary_key if primary_key else []
        self.foreign_keys = foreign_keys if foreign_keys else []
Schema.get_fields_by_tag method · python · L60-L66 (7 LOC)
transfermarkt_datasets/core/schema.py
    def get_fields_by_tag(self, tag: str) -> List[Field]:
        """Return the schema fields that carry ``tag``, in schema order.

        Args:
            tag (str): The tag to look for (checked with ``Field.has_tag``).
        """
        matching = []
        for field in self.fields:
            if field.has_tag(tag):
                matching.append(field)

        return matching
Repobility — the code-quality scanner for AI-generated software · https://repobility.com
Schema.as_frictionless_schema method · python · L68-L77 (10 LOC)
transfermarkt_datasets/core/schema.py
    def as_frictionless_schema(self) -> frictionless.schema.Schema:
        """Convert this schema into a ``frictionless.schema.Schema``.

        Only the fields are converted, each through its own
        ``as_frictionless_field``.
        """
        converted = []
        for field in self.fields:
            converted.append(field.as_frictionless_field())

        return frictionless.schema.Schema(fields=converted)
read_config function · python · L11-L22 (12 LOC)
transfermarkt_datasets/core/utils.py
def read_config(config_file="config.yml") -> Dict:
	"""Read the project configuration from a YAML file.

	Args:
			config_file (str, optional): Path to the config file. Defaults to "config.yml".

	Returns:
			Dict: The parsed config in a python dict
	"""
	# NOTE(review): yaml.Loader is the full loader and can construct arbitrary
	# python objects, so this must only ever be pointed at trusted config files.
	with open(config_file) as stream:
		return yaml.load(stream, yaml.Loader)
seasons_list function · python · L24-L52 (29 LOC)
transfermarkt_datasets/core/utils.py
def seasons_list(seasons: str) -> List[int]:
	"""Expand a "seasons" string into the list of seasons to acquire. For example,
	"2012-2014" expands to [2012, 2013, 2014] and "2012" expands to [2012].

	Args:
		seasons (str): A single season ("2012") or an inclusive range of seasons ("2012-2014").

	Returns:
		List[int]: The expanded list of seasons to acquire.

	Raises:
		ValueError: If the string is empty, or a part of it is not an integer.
		Exception: If the range holds more than 20 seasons, or the string has
			more than one "-".
	"""
	# str.split never returns an empty list, so emptiness has to be checked on
	# the string itself. ValueError is what int("") raised here before, now
	# with a message that names the actual problem.
	if not seasons.strip():
		raise ValueError("Empty string provided for seasons")

	parts = seasons.split("-")

	if len(parts) == 1: # single season string
		return [int(seasons)]

	if len(parts) == 2: # range of seasons
		start, end = parts
		season_range = list(range(int(start), int(end) + 1))

		if len(season_range) > 20:
			raise Exception("The range is too high")
		return season_range

	raise Exception(f"Invalid string: {seasons}")
page 1 / 2next ›