# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# pyre-unsafe
from __future__ import annotations
import enum
import json
import logging
from datetime import datetime
from decimal import Decimal
from itertools import islice
from typing import Any, Dict, List, NamedTuple, Optional, Set, Type, Union
from graphene_sqlalchemy.converter import (
convert_column_to_int_or_id,
convert_column_to_string,
convert_sqlalchemy_type,
)
from sqlalchemy import (
Boolean,
Column,
DateTime,
Enum,
Float,
Index,
Integer,
String,
func,
types,
JSON,
)
from sqlalchemy.dialects.mysql import BIGINT
from sqlalchemy.exc import NoSuchTableError
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, relationship
from .db import DB
from .db_support import (
DBID,
BIGDBIDType,
DBIDType,
MutableRecordMixin,
PrepareMixin,
PrimaryKeyBase,
PrimaryKeyGeneratorBase,
RecordMixin,
)
from .decorators import classproperty
from .pipeline import SourceLocation
log: logging.Logger = logging.getLogger("sapp")
Base = declarative_base()
INNODB_MAX_INDEX_LENGTH = 767
HANDLE_LENGTH = 255
MESSAGE_LENGTH = 4096
SHARED_TEXT_LENGTH = 4096
"""Models used to represent DB entries
An Issue is a particular problem found. It can exist across multiple commits. A
Run is a single run of Zoncolan over a specific commit. It may find new Issues,
or existing Issues. Each run is tied to Issues through IssueInstances.
IssueInstances have per-run information, like source location, while Issues have
attributes like the status of an issue.
"""
class LeafMapping(NamedTuple):
caller_leaf: int
callee_leaf: int
transform: int
class SourceLocationType(types.TypeDecorator):
"""Defines a new type of SQLAlchemy to store source locations.
In python land we use SourceLocation, but when stored in the databae we just
split the fields with |
"""
impl = types.String
cache_ok = False
def __init__(self) -> None:
super(SourceLocationType, self).__init__(length=255)
def process_bind_param(self, value, dialect):
"""
SQLAlchemy uses this to convert a SourceLocation object into a string.
"""
if value is None:
return None
return SourceLocation.to_string(value)
def process_result_value(self, value, dialect) -> Optional[SourceLocation]:
"""
SQLAlchemy uses this to convert a string into a SourceLocation object.
We separate the fields by a |
"""
if value is None:
return None
p = value.split("|")
if len(p) == 0:
return None
return SourceLocation.of(*map(int, p))
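# For reference, a round trip through SourceLocationType looks roughly like
# this (illustrative values; the conversion is invoked by SQLAlchemy when the
# column is written or read):
#
#   SourceLocation.of(12, 4, 9)  ->  stored as the string "12|4|9"
#   "12|4|9"                     ->  SourceLocation.of(12, 4, 9)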
class SourceLocationsType(types.TypeDecorator):
"""Defines a type to store multiple source locations in a single string"""
impl = types.String
cache_ok = False
def __init__(self) -> None:
super(SourceLocationsType, self).__init__(length=4096)
def process_bind_param(self, value, dialect) -> Optional[str]:
if value is None:
return None
return ",".join([SourceLocation.to_string(location) for location in value])
def process_result_value(self, value: Optional[str], dialect):
if value is None or value == "":
return []
assert isinstance(value, str), "Invalid SourceLocationsType %s" % str(value)
locations = value.split(",")
return [SourceLocation.from_string(location) for location in locations]
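# Multiple locations are simply comma-separated in a single column: e.g. the
# stored string "12|4|9,14|2|6" (illustrative values) decodes into a list of
# two SourceLocation objects, and an empty string decodes into an empty list.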
# See Issue.merge for information about replace_assocs
class IssueDBID(DBID):
__slots__ = ["replace_assocs"]
def __init__(self, id=None) -> None:
super().__init__(id)
self.replace_assocs = False
class IssueDBIDType(DBIDType):
def process_result_value(self, value, dialect) -> IssueDBID:
return IssueDBID(value)
class IssueBIGDBIDType(BIGDBIDType):
def process_result_value(self, value, dialect) -> IssueDBID:
return IssueDBID(value)
class IssueInstanceTraceFrameAssoc(Base, PrepareMixin, RecordMixin):
__tablename__ = "issue_instance_trace_frame_assoc"
issue_instance_id = Column(
"issue_instance_id", BIGDBIDType, primary_key=True, nullable=False
)
trace_frame_id = Column(
"trace_frame_id", BIGDBIDType, primary_key=True, nullable=False, index=True
)
issue_instance = relationship(
"IssueInstance",
primaryjoin=(
"IssueInstanceTraceFrameAssoc.issue_instance_id == "
"foreign(IssueInstance.id)"
),
uselist=False,
viewonly=True,
)
trace_frame = relationship(
"TraceFrame",
primaryjoin=(
"IssueInstanceTraceFrameAssoc.trace_frame_id == foreign(TraceFrame.id)"
),
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(
session, items, cls.issue_instance_id, cls.trace_frame_id
)
class SharedTextKind(enum.Enum):
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
feature = enum.auto()
message = enum.auto()
source = enum.auto()
sink = enum.auto()
callable = enum.auto()
filename = enum.auto()
source_detail = enum.auto()
sink_detail = enum.auto()
@classproperty
def FEATURE(cls) -> "SharedTextKind": # noqa
return cls.feature
@classproperty
def MESSAGE(cls) -> "SharedTextKind": # noqa
return cls.message
@classproperty
def SOURCE(cls) -> "SharedTextKind": # noqa
return cls.source
@classproperty
def SINK(cls) -> "SharedTextKind": # noqa
return cls.sink
@classproperty
def CALLABLE(cls) -> "SharedTextKind": # noqa
return cls.callable
@classproperty
def FILENAME(cls) -> "SharedTextKind": # noqa
return cls.filename
@classproperty
def SOURCE_DETAIL(cls) -> "SharedTextKind": # noqa
return cls.source_detail
@classproperty
def SINK_DETAIL(cls) -> "SharedTextKind": # noqa
return cls.sink_detail
@classmethod
def from_string(cls, string: str) -> Optional[SharedTextKind]:
return cls.__members__.get(string)
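# For example, based on the lookup above:
#
#   SharedTextKind.from_string("source")     -> SharedTextKind.source
#   SharedTextKind.from_string("not_a_kind") -> None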
class Feature(Base, PrepareMixin, RecordMixin):
"""
Features (also known as breadcrumbs) are structured
output from SAPP tools that allows tagging issues and
traces with metadata that can later be used for querying
and filtering said issues and traces.
"""
__tablename__ = "features"
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, primary_key=True)
features_issue_instance = relationship(
"IssueInstanceFeatureAssoc",
primaryjoin=("Feature.id == foreign(IssueInstanceFeatureAssoc.feature_id)"),
viewonly=True,
)
data: Column[Union[List[Any], Dict[str, Any]]] = Column(
JSON(), nullable=False, index=False
)
@classmethod
def merge(cls, session, features):
return cls._merge_by_keys(
session,
features,
lambda feature: json.dumps(getattr(feature, cls.data.key), sort_keys=True),
cls.data,
)
class SharedText(Base, PrepareMixin, RecordMixin):
"""Any string-ish type that can be shared as a property of some other
object. (e.g. features, sources, sinks). The table name 'messages' is due
to legacy reasons."""
__tablename__ = "messages"
__table_args__ = (
Index("ix_messages_handle", "contents", "kind", mysql_length={"contents": 767}),
)
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, primary_key=True)
# pyre-fixme[8]: Attribute has type `str`; used as `Column[str]`.
contents: str = Column(
String(length=SHARED_TEXT_LENGTH),
nullable=False,
)
# pyre-fixme[8]: Attribute has type `SharedTextKind`; used as `Column[str]`.
kind: SharedTextKind = Column(
Enum(SharedTextKind), server_default="feature", nullable=False, index=True
)
issue_instances = association_proxy("shared_text_issue_instance", "issue_instance")
shared_text_issue_instance = relationship(
"IssueInstanceSharedTextAssoc",
primaryjoin=(
"SharedText.id == foreign(IssueInstanceSharedTextAssoc.shared_text_id)"
),
viewonly=True,
)
trace_frames = association_proxy("shared_text_trace_frame", "trace_frames")
shared_text_trace_frame = relationship(
"TraceFrameLeafAssoc",
primaryjoin=("SharedText.id == foreign(TraceFrameLeafAssoc.leaf_id)"),
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_by_keys(
session,
items,
lambda item: "%s:%s" % (item.contents, item.kind),
cls.contents,
cls.kind,
)
class IssueInstanceSharedTextAssoc(Base, PrepareMixin, RecordMixin):
"""Assoc table between issue instances and its properties that are
representable by a string. The DB table name and column names are due to
legacy reasons and warrant some explanation:
- 'Features' used to be the only shared text of the assoc, now, the assoc
also accounts for 'Sources' and 'Sinks' and possibly more.
- 'messages' table used to be only for 'messages', now, it contains
features, sources and sinks and possibly more.
- It is expensive to rename the DB tables, so renaming only happened in
the model. This is why it looks like we have 3 different terms for the
same thing: 'messages', 'shared_text', 'features'.
When in doubt, trust the property and method names used in the model and
refer to the relationship joins for how objects relate to each other.
"""
__tablename__ = "issue_instance_feature_assoc"
issue_instance_id = Column(
"issue_instance_id", BIGDBIDType, primary_key=True, nullable=False
)
shared_text_id = Column("feature_id", BIGDBIDType, primary_key=True, nullable=False)
issue_instance = relationship(
"IssueInstance",
primaryjoin=(
"IssueInstanceSharedTextAssoc.issue_instance_id =="
"foreign(IssueInstance.id)"
),
uselist=False,
viewonly=True,
)
shared_text = relationship(
"SharedText",
primaryjoin=(
"IssueInstanceSharedTextAssoc.shared_text_id == foreign(SharedText.id)"
),
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(
session, items, cls.issue_instance_id, cls.shared_text_id
)
class IssueInstanceFeatureAssoc(Base, PrepareMixin, RecordMixin):
__tablename__ = "issue_instance_structured_features_assoc"
issue_instance_id = Column(
"issue_instance_id", BIGDBIDType, primary_key=True, nullable=False
)
feature_id = Column("feature_id", BIGDBIDType, primary_key=True, nullable=False)
issue_instance = relationship(
"IssueInstance",
primaryjoin=(
"IssueInstanceFeatureAssoc.issue_instance_id ==" "foreign(IssueInstance.id)"
),
uselist=False,
viewonly=True,
)
features = relationship(
"Feature",
primaryjoin=("IssueInstanceFeatureAssoc.feature_id == foreign(Feature.id)"),
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(session, items, cls.issue_instance_id, cls.feature_id)
class TraceKind(enum.Enum):
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
precondition = enum.auto()
postcondition = enum.auto()
@classproperty
def PRECONDITION(cls) -> "TraceKind": # noqa
return cls.precondition
@classproperty
def POSTCONDITION(cls) -> "TraceKind": # noqa
return cls.postcondition
@classmethod
def create_from_string(cls, value: str) -> TraceKind:
if value == "precondition":
return cls.precondition
if value == "postcondition":
return cls.postcondition
raise ValueError(f"`{value}` is not a valid `TraceKind`")
class IssueInstance(Base, PrepareMixin, MutableRecordMixin):
"""A particularly instance of an issue found in a run"""
__tablename__ = "issue_instances"
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, primary_key=True)
location = Column(
SourceLocationType,
nullable=False,
doc="Location (possibly a range) of the issue",
)
filename_id = Column(BIGDBIDType, nullable=False, server_default="0", default=0)
filename = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == IssueInstance.filename_id",
uselist=False,
viewonly=True,
)
callable_id = Column(BIGDBIDType, nullable=False, server_default="0", default=0)
callable = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == IssueInstance.callable_id",
uselist=False,
viewonly=True,
)
is_new_issue: Column[Optional[bool]] = Column(
Boolean,
index=True,
default=False,
doc="True if the issue did not exist before this instance",
)
run_id = Column(BIGDBIDType, nullable=False, index=True)
issue_id = Column(BIGDBIDType, nullable=False, index=True)
issue = relationship(
"Issue",
primaryjoin="foreign(Issue.id) == IssueInstance.issue_id",
uselist=False,
viewonly=True,
)
fix_info_id = Column(BIGDBIDType, nullable=True)
fix_info = relationship(
"IssueInstanceFixInfo",
primaryjoin=("foreign(IssueInstanceFixInfo.id) == IssueInstance.fix_info_id"),
uselist=False,
viewonly=True,
)
message_id = Column(BIGDBIDType, nullable=True)
message = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == IssueInstance.message_id",
uselist=False,
viewonly=True,
)
trace_frames = association_proxy("issue_instance_trace_frame", "trace_frame")
issue_instance_trace_frame = relationship(
"IssueInstanceTraceFrameAssoc",
primaryjoin=(
"IssueInstance.id == "
"foreign(IssueInstanceTraceFrameAssoc.issue_instance_id)"
),
viewonly=True,
)
shared_texts = association_proxy("issue_instance_shared_text", "shared_text")
issue_instance_shared_text = relationship(
"IssueInstanceSharedTextAssoc",
primaryjoin=(
"IssueInstance.id == "
"foreign(IssueInstanceSharedTextAssoc.issue_instance_id)"
),
viewonly=True,
)
features = association_proxy("issue_instance_feature", "features")
issue_instance_feature = relationship(
"IssueInstanceFeatureAssoc",
primaryjoin=(
"IssueInstance.id == "
"foreign(IssueInstanceFeatureAssoc.issue_instance_id)"
),
viewonly=True,
)
min_trace_length_to_sources: Column[Optional[int]] = Column(
Integer, nullable=True, doc="The minimum trace length to sources"
)
min_trace_length_to_sinks: Column[Optional[int]] = Column(
Integer, nullable=True, doc="The minimum trace length to sinks"
)
rank: Column[Optional[int]] = Column(
Integer,
server_default="0",
doc="The higher the rank, the higher the priority for this issue",
)
callable_count: Column[Optional[int]] = Column(
Integer,
server_default="0",
doc="Number of issues in this callable for this run",
)
min_trace_length_to_entrypoints: Column[Optional[int]] = Column(
Integer, nullable=True, doc="The minimum trace length to entrypoints"
)
def get_shared_texts_by_kind(self, kind: SharedTextKind) -> List[SharedText]:
return [text for text in self.shared_texts if text.kind == kind]
def get_trace_frames_by_kind(self, kind: TraceKind):
return [frame for frame in self.trace_frames if frame.kind == kind]
@classmethod
def merge(cls, session, items):
for i in items:
# If the issue is new, then the instance has to be new. But note
# that we still may need RunDiffer, because issues that disappeared
# for a while and then came back are also marked new.
i.is_new_issue = i.issue_id.is_new
yield i
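# Illustrative sketch: with an IssueInstance `instance` loaded in a session
# (assumed to be set up elsewhere), the helpers above pull out specific kinds
# of associated data:
#
#   sources = instance.get_shared_texts_by_kind(SharedTextKind.source)
#   postconditions = instance.get_trace_frames_by_kind(TraceKind.postcondition)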
class IssueStatus(enum.Enum):
"""Issues are born uncategorized. Humans can
set them to FALSE_POSITIVE or VALID_BUG upon review."""
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
"""An issue that hasn't been marked as a bug or FP"""
uncategorized = enum.auto()
"""Not a security bug, but a bad practice. Still needs fixing."""
bad_practice = enum.auto()
"""False positive from analysis"""
false_positive = enum.auto()
"""Reviewed and seen to be a valid bug that needs fixing"""
valid_bug = enum.auto()
"""I don't care about this particular issue,
but still want to see issues of this kind."""
do_not_care = enum.auto()
@classproperty
def UNCATEGORIZED(cls) -> "IssueStatus": # noqa
return cls.uncategorized
@classproperty
def BAD_PRACTICE(cls) -> "IssueStatus": # noqa
return cls.bad_practice
@classproperty
def FALSE_POSITIVE(cls) -> "IssueStatus": # noqa
return cls.false_positive
@classproperty
def VALID_BUG(cls) -> "IssueStatus": # noqa
return cls.valid_bug
@classproperty
def DO_NOT_CARE(cls) -> "IssueStatus": # noqa
return cls.do_not_care
class Issue(Base, PrepareMixin, MutableRecordMixin):
"""An issue coming from the static analysis.
An issue can persist across multiple runs, even if it moves around in the
code.
"""
__tablename__ = "issues"
# pyre-fixme[8]: Attribute has type `IssueDBID`; used as `Column[typing.Any]`.
id: IssueDBID = Column(IssueBIGDBIDType, primary_key=True, nullable=False)
handle: Column[str] = Column(
String(length=HANDLE_LENGTH),
nullable=False,
unique=True,
doc="This handle should uniquely identify an issue across runs on "
+ "different code revisions",
)
code: Column[int] = Column(
Integer, doc="Code identifiying the issue type", nullable=False, index=True
)
instances = relationship(
"IssueInstance",
primaryjoin="Issue.id == foreign(IssueInstance.issue_id)",
viewonly=True,
)
first_seen: Column[datetime] = Column(
DateTime,
doc="time of the first run that found this issue",
nullable=False,
index=True,
)
status: Column[str] = Column(
Enum(IssueStatus),
doc="Shows the issue status from the latest run",
server_default="uncategorized",
nullable=False,
index=True,
)
task_number: Column[Optional[int]] = Column(
Integer, doc="Task number (not fbid) that is tracking this issue"
)
triage_history_fbid: Column[Optional[int]] = Column(
BIGINT(unsigned=True),
nullable=True,
doc="FBID for EntZoncolanIssueTriageHistory",
)
feedback_fbid: Column[Optional[int]] = Column(
BIGINT(unsigned=True), nullable=True, doc="FBID for EntZoncolanFeedback"
)
detected_time: Column[int] = Column(
BIGINT(20, unsigned=True),
doc="unix timestamp of first detection",
nullable=False,
index=True,
server_default="0",
)
triage_time: Column[Optional[int]] = Column(
BIGINT(20, unsigned=True),
doc="unix timestamp of triage (typically first triage from history)",
nullable=True,
)
start_triage_time: Column[Optional[int]] = Column(
BIGINT(20, unsigned=True),
doc="unix timestamp of examination leading to triage",
nullable=True,
)
triage_duration: Column[int] = Column(
BIGINT(20, unsigned=True),
doc="duration in seconds spent triaging",
nullable=False,
server_default="0",
)
triaged_by_fbid: Column[Optional[int]] = Column(
BIGINT(unsigned=True),
nullable=True,
doc="FBID for EntInternUser (typically actor of first triage from history)",
)
@classmethod
def _take(cls, n, iterable):
"Return first n items of the iterable as a list"
return list(islice(iterable, n))
@classmethod
def merge(cls, session, issues):
return cls._merge_by_key(session, issues, cls.handle)
class RunStatus(enum.Enum):
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
finished = enum.auto()
incomplete = enum.auto()
skipped = enum.auto()
failed = enum.auto()
@classproperty
def FINISHED(cls) -> "RunStatus": # noqa
return cls.finished
@classproperty
def INCOMPLETE(cls) -> "RunStatus": # noqa
return cls.incomplete
@classproperty
def SKIPPED(cls) -> "RunStatus": # noqa
return cls.skipped
@classproperty
def FAILED(cls) -> "RunStatus": # noqa
return cls.failed
class PurgeStatus(enum.Enum):
"Purge status of a run"
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
# Run has not been touched by purging automation
unpurged = enum.auto()
# Issue instances associated with an untriaged issue have been deleted
# Trace frames marked as UNREACHABLE have been deleted
purged = enum.auto()
# Trace frames not reachable by an issue instance whose issue is triaged have been
# marked UNREACHABLE.
ready_to_purge = enum.auto()
@classproperty
def UNPURGED(cls) -> str: # noqa
# pyre-ignore[7]: Coerce to string for SQLAlchemy
return cls.unpurged
@classproperty
def READY_TO_PURGE(cls) -> str: # noqa
# pyre-ignore[7]: Coerce to string for SQLAlchemy
return cls.ready_to_purge
@classproperty
def PURGED(cls) -> str: # noqa
# pyre-ignore[7]: Coerce to string for SQLAlchemy
return cls.purged
class FrameReachability(enum.Enum):
"Internal reachability status of a trace frame"
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
unreachable = enum.auto()
reachable = enum.auto()
@classproperty
def UNREACHABLE(cls) -> str: # noqa
# pyre-ignore[7]: Coerce to string for SQLAlchemy
return cls.unreachable
@classproperty
def REACHABLE(cls) -> str: # noqa
# pyre-ignore[7]: Coerce to string for SQLAlchemy
return cls.reachable
CURRENT_DB_VERSION = 1
class Run(Base):
"""A particular run of the static analyzer.
Each time output is parsed from the static analyzer we generate a new run. A
run has multiple IssueInstances."""
__tablename__ = "runs"
id = Column(BIGDBIDType, primary_key=True)
job_id: Column[Optional[str]] = Column(String(length=255), index=True)
date: Column[datetime] = Column(
DateTime, doc="The date/time the analysis was run", nullable=False
)
commit_hash: Column[Optional[str]] = Column(
String(length=255),
doc="The commit hash of the codebase",
nullable=True,
index=True,
)
revision_id: Column[Optional[int]] = Column(
Integer, doc="Phabricator Diff number (DXXXXXX)", nullable=True, index=True
)
differential_id: Column[Optional[int]] = Column(
Integer,
doc="Phabricator Version number",
nullable=True,
index=True,
)
hh_version: Column[Optional[str]] = Column(
String(length=255), doc="The output of hh_server --version"
)
branch: Column[Optional[str]] = Column(
String(length=255),
doc="Branch the commit is based on",
nullable=True,
index=True,
)
issue_instances = relationship(
"IssueInstance",
primaryjoin="Run.id == foreign(IssueInstance.run_id)",
backref="run",
viewonly=True,
)
status: Column[str] = Column(
Enum(RunStatus), server_default="finished", nullable=False, index=True
)
status_description: Column[Optional[str]] = Column(
String(length=255), doc="The reason why a run didn't finish", nullable=True
)
kind: Column[Optional[str]] = Column(
String(length=255),
doc=(
"Specify different kinds of runs, e.g. MASTER vs. TEST., GKFORXXX, etc. "
"in the same DB"
),
nullable=True,
index=True,
)
repository: Column[Optional[str]] = Column(
String(length=255),
doc=("The repository that static analysis was run on."),
nullable=True,
)
db_version: Column[int] = Column(
Integer,
doc="Tracks under which DB version this was written (for migrations)",
nullable=False,
default=CURRENT_DB_VERSION,
server_default="0",
)
purge_status: Column[str] = Column(
Enum(PurgeStatus),
server_default="unpurged",
nullable=False,
doc="Tracks whether Internal deletion jobs have purged unnecessary issue instances "
+ "and trace frames from this run. Should NOT be set to anything but the default in SAPP code.",
)
def get_summary(self, **kwargs) -> RunSummary:
session = Session.object_session(self)
return RunSummary(
commit_hash=self.commit_hash,
differential_id=self.differential_id,
id=self.id.resolved(),
job_id=self.job_id,
num_new_issues=self._get_num_new_issue_instances(session),
num_total_issues=self._get_num_total_issues(session),
alarm_counts=self._get_alarm_counts(session),
)
def _get_num_new_issue_instances(self, session) -> int:
return (
session.query(IssueInstance)
.filter(IssueInstance.run_id == self.id)
.filter(IssueInstance.is_new_issue.is_(True))
.count()
)
def _get_num_total_issues(self, session) -> int:
return (
session.query(IssueInstance).filter(IssueInstance.run_id == self.id).count()
)
def _get_alarm_counts(self, session) -> Dict[int, int]:
return dict(
session.query(Issue.code, func.count(Issue.code))
.filter(IssueInstance.run_id == self.id)
.outerjoin(IssueInstance.issue)
.group_by(Issue.code)
.all()
)
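# Illustrative: for a run attached to a session, `run.get_summary()` above
# builds a RunSummary whose `alarm_counts` maps issue codes to the number of
# instances found in that run, e.g. {5001: 12, 5007: 3} (made-up values).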
class MetaRun(Base):
"""An identifier that represents multiple runs which should be grouped semantically.
Meta-runs and runs have a many-to-many relationship, and the purpose of a meta-run
is to allow querying & displaying results for all related runs without having to
browse each of them separately."""
__tablename__ = "metaruns"
id = Column(BIGDBIDType, primary_key=True, autoincrement=False)
# This is the moral equivalent of job_id, but named in a more intuitive manner.
# Allows determining the latest meta run for each custom run separately.
custom_run_name: Column[Optional[str]] = Column(String(length=255), nullable=True)
date: Column[datetime] = Column(
DateTime, doc="The date/time the meta-run was generated", nullable=False
)
# We want to be able to filter meta-runs by completion. Towards that end, we plan on
# comparing the expected run count against the number of runs actually written to
# the database (see the illustrative sketch after MetaRunToRunAssoc below).
expected_run_count: Column[Optional[int]] = Column(Integer, nullable=True)
kind: Column[Optional[str]] = Column(
String(length=255),
doc=(
"Specify different kinds of runs, e.g. MASTER vs. TEST., GKFORXXX, etc. "
"in the same DB"
),
nullable=True,
index=True,
)
db_version: Column[int] = Column(
Integer,
doc="Tracks under which DB version this was written (for migrations)",
nullable=False,
default=CURRENT_DB_VERSION,
)
status: Column[str] = Column(
Enum(RunStatus), server_default="finished", nullable=False, index=True
)
class RunSummary:
def __init__(
self,
commit_hash: Optional[str],
differential_id: Optional[int],
id: Optional[int],
job_id: Optional[str],
num_new_issues: int,
num_total_issues: int,
num_missing_preconditions: Optional[int] = None,
num_missing_postconditions: Optional[int] = None,
alarm_counts: Optional[Dict[int, int]] = None,
) -> None:
self.commit_hash = commit_hash
self.differential_id = differential_id
self.id = id
self.job_id = job_id
self.num_new_issues = num_new_issues
self.num_total_issues = num_total_issues
self.num_missing_preconditions = num_missing_preconditions
self.num_missing_postconditions = num_missing_postconditions
self.alarm_counts: Dict[int, int] = alarm_counts or {}
def todict(self) -> Dict[str, Any]:
return self.__dict__
@classmethod
def fromdict(cls, d) -> "RunSummary":
return cls(**d)
class MetaRunToRunAssoc(Base, PrepareMixin, RecordMixin):
"""The responsibility of filling out the meta-run to run assoc is on the child jobs
of a larger run.
"""
__tablename__ = "metarun_run_assoc"
meta_run_id = Column(BIGDBIDType, nullable=False, primary_key=True)
run_id = Column(BIGDBIDType, nullable=False, primary_key=True)
meta_run = relationship(
"MetaRun",
primaryjoin=("MetaRunToRunAssoc.meta_run_id == foreign(MetaRun.id)"),
uselist=False,
viewonly=True,
)
run = relationship(
"Run",
primaryjoin=("MetaRunToRunAssoc.run_id == foreign(Run.id)"),
uselist=False,
viewonly=True,
)
run_label = Column(
String(length=1024),
nullable=True,
doc="Optional label associated with a child run (eg. Buck target)",
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(session, items, cls.meta_run_id, cls.run_id)
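# Illustrative sketch of the completion check hinted at in MetaRun above,
# assuming an open `session` and a loaded `meta_run`:
#
#   written = (
#       session.query(MetaRunToRunAssoc)
#       .filter(MetaRunToRunAssoc.meta_run_id == meta_run.id)
#       .count()
#   )
#   is_complete = (
#       meta_run.expected_run_count is not None
#       and written >= meta_run.expected_run_count
#   )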
class TraceFrameLeafAssoc(Base, PrepareMixin, RecordMixin):
__tablename__ = "trace_frame_message_assoc"
trace_frame_id = Column(BIGDBIDType, nullable=False, primary_key=True)
leaf_id = Column("message_id", BIGDBIDType, nullable=False, primary_key=True)
# The minimum trace length unfortunately can be off and actually lead to
# loops. This is a known problem and any code generating traces should
# additionally have cycle detection.
trace_length: Column[Optional[int]] = Column(
Integer, doc="minimum trace length to the given leaf", nullable=True
)
trace_frame = relationship(
"TraceFrame",
primaryjoin=("TraceFrameLeafAssoc.trace_frame_id == foreign(TraceFrame.id)"),
uselist=False,
viewonly=True,
)
leaves = relationship(
"SharedText",
primaryjoin="TraceFrameLeafAssoc.leaf_id == foreign(SharedText.id)",
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(session, items, cls.trace_frame_id, cls.leaf_id)
class IssueInstanceFixInfo(Base, PrepareMixin, RecordMixin):
__tablename__ = "issue_instance_fix_info"
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, nullable=False, primary_key=True)
fix_info: Column[str] = Column(
String(length=INNODB_MAX_INDEX_LENGTH), nullable=False
)
issue_instance = relationship(
"IssueInstance",
primaryjoin=("foreign(IssueInstance.fix_info_id) == IssueInstanceFixInfo.id"),
uselist=False,
viewonly=True,
)
class TraceFrame(Base, PrepareMixin, RecordMixin):
__tablename__ = "trace_frames"
__table_args__ = (
Index("ix_traceframe_run_caller_port", "run_id", "caller_id", "caller_port"),
Index("ix_traceframe_run_callee_port", "run_id", "callee_id", "callee_port"),
)
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, nullable=False, primary_key=True)
kind: Column[str] = Column(Enum(TraceKind), nullable=False, index=False)
caller_id = Column(BIGDBIDType, nullable=False, server_default="0", default=0)
caller = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == TraceFrame.caller_id",
uselist=False,
viewonly=True,
)
# pyre-fixme[8]: Attribute has type `str`; used as `Column[str]`.
caller_port: str = Column(
String(length=INNODB_MAX_INDEX_LENGTH),
nullable=False,
server_default="",
doc="The caller port of this call edge",
)
callee_id = Column(BIGDBIDType, nullable=False, server_default="0", default=0)
callee = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == TraceFrame.callee_id",
uselist=False,
viewonly=True,
)
callee_location = Column(
SourceLocationType,
nullable=False,
doc="The location of the callee in the source code (line|start|end)",
)
# pyre-fixme[8]: Attribute has type `str`; used as `Column[str]`.
callee_port: str = Column(
String(length=INNODB_MAX_INDEX_LENGTH),
nullable=False,
server_default="",
doc="The callee port of this call edge'",
)
filename_id = Column(BIGDBIDType, nullable=False, server_default="0", default=0)
run_id = Column("run_id", BIGDBIDType, nullable=False, index=False)
type_interval_lower: Column[Optional[int]] = Column(
Integer, nullable=True, doc="Class interval lower-bound (inclusive)"
)
type_interval_upper: Column[Optional[int]] = Column(
Integer, nullable=True, doc="Class interval upper-bound (inclusive)"
)
preserves_type_context: Column[bool] = Column(
Boolean,
default=False,
server_default="0",
nullable=False,
doc="Whether the call preserves the calling type context",
)
titos = Column(
SourceLocationsType,
doc="Locations of TITOs aka abductions for the trace frame",
nullable=False,
server_default="",
)
reachability: Column[str] = Column(
Enum(FrameReachability),
server_default="unreachable",
nullable=False,
doc="Reachability of this trace frame, for deletion purposes. "
+ "Is set by internal jobs and should NOT be set to anything but the default in SAPP code.",
)
annotations = relationship(
"TraceFrameAnnotation",
primaryjoin=("TraceFrame.id == foreign(TraceFrameAnnotation.trace_frame_id)"),
uselist=True,
viewonly=True,
)
leaves = association_proxy("leaf_assoc", "leaves")
lengths = association_proxy("leaf_assoc", "trace_length")
leaf_assoc = relationship(
"TraceFrameLeafAssoc",
primaryjoin=("TraceFrame.id == foreign(TraceFrameLeafAssoc.trace_frame_id)"),
uselist=True,
viewonly=True,
)
issue_instances = association_proxy("trace_frame_issue_instance", "issue_instance")
trace_frame_issue_instance = relationship(
"IssueInstanceTraceFrameAssoc",
primaryjoin=(
"TraceFrame.id == foreign(IssueInstanceTraceFrameAssoc.trace_frame_id)"
),
viewonly=True,
)
leaf_mapping: Set[LeafMapping] = set()
@staticmethod
def type_intervals_match_or_ignored(
caller_start: Optional[int],
caller_end: Optional[int],
caller_preserves: Optional[bool],
callee_start: Optional[int],
callee_end: Optional[int],
callee_preserves: Optional[bool],
) -> bool:
"""
Compares the type intervals of the "caller" trace frame and the "callee"
trace frame and returns True if the callee frame should be kept: either the
intervals match, or a bound is missing / the callee does not preserve the
type context, so no filtering is possible. Returns False if the frame can be
filtered out. This works both backwards and forwards.
"""
if (
caller_start is None
or caller_end is None
or callee_start is None
or callee_end is None
or not callee_preserves
):
# in this case we cannot filter out frames
return True
assert caller_start <= caller_end
assert callee_start <= callee_end
if caller_start <= callee_start and callee_end <= caller_end:
# we have a match so we don't filter out the frame
# in other words, the callee's type interval is a subset
# (or the same type) of the caller's
return True
# There is no match, so we can filter out the frame.
# Note that this can happen in 2 cases
# (in both cases the caller and callee frames are part of the same base type,
# since we know that the callee 'preserves type'):
# 1. the caller is a subset of the callee frame
# (e.g. we know the caller is a Dog and the callee could have a trace frame that
# allows any animal to traverse)
# 2. the caller is an adjacent type
# (e.g. we know the caller is a Dog and the callee could have a Cat option
# that needs to be filtered)
return False
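# Illustrative calls of the interval check above (made-up interval values):
#
#   # caller covers [1, 10]; callee covers [3, 5] and preserves type context
#   TraceFrame.type_intervals_match_or_ignored(1, 10, True, 3, 5, True)    # True
#   # callee interval [11, 20] lies outside the caller's, so it is filtered
#   TraceFrame.type_intervals_match_or_ignored(1, 10, True, 11, 20, True)  # False
#   # a missing bound (or a callee that does not preserve type) disables filtering
#   TraceFrame.type_intervals_match_or_ignored(None, None, None, 3, 5, True)  # True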
# Extra bits of information we can show on a TraceFrame.
# This may be a message description, or it may be the start of another series
# of traces leading to some other leaf. TraceFrameAnnotationTraceFrameAssoc
# contains the first hop towards that leaf.
class TraceFrameAnnotation(Base, PrepareMixin, RecordMixin):
__tablename__ = "trace_frame_annotations"
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
id: DBID = Column(BIGDBIDType, nullable=False, primary_key=True)
location = Column(
SourceLocationType, nullable=False, doc="The location for the message"
)
kind: Column[Optional[str]] = Column(String(length=255), nullable=True, index=True)
# pyre-fixme[8]: Attribute has type `str`; used as `Column[str]`.
message: str = Column(
String(length=4096),
doc="Message describing info about the trace",
nullable=False,
)
leaf_id = Column(BIGDBIDType, nullable=True)
leaf = relationship(
"SharedText",
primaryjoin="foreign(SharedText.id) == TraceFrameAnnotation.leaf_id",
uselist=False,
viewonly=True,
)
# pyre-fixme[8]: Attribute has type `Optional[str]`; used as `Column[str]`.
link: Optional[str] = Column(
String(length=4096),
doc="An optional URL linking the message to more info (Quandary)",
nullable=True,
)
# pyre-fixme[8]: Attribute has type `Optional[str]`; used as `Column[str]`.
trace_key: Optional[str] = Column(
String(length=INNODB_MAX_INDEX_LENGTH),
nullable=True,
doc="Link to possible pre/post traces (caller_condition).",
)
# pyre-fixme[8]: Attribute has type `DBID`; used as `Column[typing.Any]`.
trace_frame_id: DBID = Column(BIGDBIDType, nullable=False, index=True)
trace_frame = relationship(
"TraceFrame",
primaryjoin=("TraceFrame.id == foreign(TraceFrameAnnotation.trace_frame_id)"),
uselist=True,
viewonly=True,
)
child_trace_frames = association_proxy(
"trace_frame_annotation_trace_frame", "trace_frame"
)
trace_frame_annotation_trace_frame = relationship(
"TraceFrameAnnotationTraceFrameAssoc",
primaryjoin=(
"TraceFrameAnnotation.id == "
"foreign(TraceFrameAnnotationTraceFrameAssoc.trace_frame_annotation_id)"
),
viewonly=True,
)
# A TraceFrameAnnotation may indicate more traces branching out from a trace
# frame towards a different leaf/trace kind. In that case, this assoc describes
# the first hop trace frame from the annotation. It is similar to
# IssueInstanceTraceFrameAssoc, which indicates the first hop trace frame from
# the issue instance.
class TraceFrameAnnotationTraceFrameAssoc(Base, PrepareMixin, RecordMixin):
__tablename__ = "trace_frame_annotation_trace_frame_assoc"
trace_frame_annotation_id = Column(
"trace_frame_annotation_id", BIGDBIDType, primary_key=True, nullable=False
)
trace_frame_id = Column(
"trace_frame_id", BIGDBIDType, primary_key=True, nullable=False, index=True
)
trace_frame_annotation = relationship(
"TraceFrameAnnotation",
primaryjoin=(
"TraceFrameAnnotationTraceFrameAssoc.trace_frame_annotation_id == "
"foreign(TraceFrameAnnotation.id)"
),
uselist=False,
viewonly=True,
)
trace_frame = relationship(
"TraceFrame",
primaryjoin=(
"TraceFrameAnnotationTraceFrameAssoc.trace_frame_id == "
"foreign(TraceFrame.id)"
),
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_assocs(
session, items, cls.trace_frame_annotation_id, cls.trace_frame_id
)
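# Illustrative: given a loaded TraceFrameAnnotation `annotation`, the first-hop
# frames of the sub-trace it describes are reachable through the association
# proxy defined on it:
#
#   for child_frame in annotation.child_trace_frames:
#       print(child_frame.callee_port, child_frame.callee_location)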
class WarningMessage(Base):
__tablename__ = "warning_messages"
code: Column[int] = Column(Integer, autoincrement=False, primary_key=True)
message: Column[str] = Column(String(length=4096), nullable=False)
class WarningCodeCategory(enum.Enum):
# Do NOT reorder the enums. Depending on the type of database, existing
# DBs may have these enums represented internally as ints based on the
# order shown here, and changing it here messes up existing data. This
# also means that new enums should be added AT THE END of the list.
bug = enum.auto()
code_smell = enum.auto()
@classproperty
def BUG(cls) -> "WarningCodeCategory": # noqa
return cls.bug
@classproperty
def CODE_SMELL(cls) -> "WarningCodeCategory": # noqa
return cls.code_smell
class WarningCodeProperties(Base):
"""Contains properties describing each warning code"""
__tablename__ = "warning_code_properties"
code: Column[int] = Column(
Integer,
autoincrement=False,
nullable=False,
primary_key=True,
doc="Code identifiying the issue type",
)
category: Column[Optional[str]] = Column(
Enum(WarningCodeCategory),
nullable=True,
index=False,
doc=(
"The category of problems that issues with this warning code "
"can result in"
),
)
new_issue_rate: Column[Optional[Decimal]] = Column(
Float,
nullable=True,
index=False,
doc="Average number of new issues per day (computed column)",
)
bug_count: Column[Optional[int]] = Column(
Integer,
nullable=True,
index=False,
doc="Number of issues in this category (computed column)",
)
avg_trace_len: Column[Optional[Decimal]] = Column(
Float, nullable=True, index=False, doc="Deprecated. See avg_fwd/bwd_trace_len"
)
avg_fwd_trace_len: Column[Optional[Decimal]] = Column(
Float,
nullable=True,
index=False,
doc=(
"Average (min) length of forward traces for the given warning code "
"(computed column)"
),
)
avg_bwd_trace_len: Column[Optional[Decimal]] = Column(
Float,
nullable=True,
index=False,
doc=(
"Average (min) length of backward traces for the given warning "
"code (computed column)"
),
)
snr: Column[Optional[Decimal]] = Column(
Float,
nullable=True,
index=False,
doc=(
"Signal to noise ratio based on triaged issues (computed column). "
"Ratio of (valid + bad practice) to (false positive + don't care)"
),
)
is_snr_significant: Column[Optional[bool]] = Column(
Boolean,
nullable=True,
index=False,
doc=(
"True if we are confident about the snr (computed column). "
"Depends on percentage of triaged issues and number of issues."
),
)
discoverable: Column[Optional[bool]] = Column(
Boolean,
nullable=True,
index=False,
doc="True if an attacker can discover the issue",
)
health_score: Column[Optional[Decimal]] = Column(
Float,
nullable=True,
index=False,
doc=(
"Scoring for the health of the warning code, between 0 and 1, "
"based on the values in the other columns (computed column)"
),
)
notes: Column[Optional[str]] = Column(
String(length=4096),
nullable=True,
index=False,
doc="Free form field for note-taking",
)
class RunOrigin(Base, PrepareMixin, RecordMixin):
"""This table associates runs with metadata concerning how the run was built, which we
call run origins. An example of a run origin is a Buck target."""
__tablename__ = "run_origins"
id = Column(BIGDBIDType, nullable=False, primary_key=True)
run_id = Column(BIGDBIDType, nullable=False, index=True)
origin = Column(String(length=255), nullable=False)
run = relationship(
"Run",
primaryjoin=("RunOrigin.run_id == foreign(Run.id)"),
uselist=False,
viewonly=True,
)
@classmethod
def merge(cls, session, items):
return cls._merge_by_key(session, items, cls.run_id)
class PrimaryKey(Base, PrimaryKeyBase):
pass
class PrimaryKeyGenerator(PrimaryKeyGeneratorBase):
PRIMARY_KEY: Type = PrimaryKey
QUERY_CLASSES: Set[Type] = {
Issue,
IssueInstance,
IssueInstanceFixInfo,
SharedText,
Run,
TraceFrame,
TraceFrameAnnotation,
Feature,
}
def create(db: DB) -> None:
try:
Base.metadata.create_all(db.engine)
except NoSuchTableError:
pass
convert_sqlalchemy_type.register(SourceLocationType)(convert_column_to_string)
convert_sqlalchemy_type.register(BIGDBIDType)(convert_column_to_int_or_id)