Skip to content

Apache Cassandra

Gustavo Armenta edited this page May 16, 2017 · 15 revisions
  • Apple has 75,000+ nodes running Cassandra in 2015
  • Data replication in same datacenter takes 150ms
  • Data replication from US to Asia takes 750ms
  • Don't store large files. Cassandra has a 2GB limit per row. In reality, you should test and probably work with 100MB chunk size files.
  • Keep partition size under 100MB
  • Keep collection columns under 1,000 items
  • Create index lookup tables for collection columns
  • Create Data Aggregations by doing PK, Year, Month, Day, Hour, Counter, Sum
  • Use BATCH to insert entity in many tables
  • Use UPDATE-WHERE-IF pattern to access data only once (e.g. reset password scenario)
CREATE KEYSPACE killrvideo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
USE killrvideo;

CREATE TABLE videos (
 video_id timeuuid,
 added_date timestamp,
 description text,
 title text,
 user_id uuid,
 PRIMARY KEY ((video_id)));
COPY videos FROM 'labwork/exercise-2/videos.csv' WITH HEADER=true;
SELECT COUNT(*) FROM videos;
SELECT * FROM videos LIMIT 10;

CREATE TABLE videos_by_title_year (
 title text,
 added_year int,
 added_date timestamp,
 description text,
 user_id uuid,
 video_id uuid,
 PRIMARY KEY ((title, added_year)));
COPY videos_by_title_year FROM 'labwork/exercise-3/videos_by_title_year.csv' WITH HEADER=true;
SELECT * FROM videos_by_title_year LIMIT 10;
SELECT * FROM videos_by_title_year WHERE title='Chicago Demo';
SELECT * FROM videos_by_title_year WHERE title='Chicago Demo' AND added_year=2015;

CREATE TABLE videos_by_tag_year (
 tag text,
 added_year int,
 video_id timeuuid,
 added_date timestamp,
 description text,
 title text,
 user_id uuid,
 PRIMARY KEY ((tag), added_year, video_id)
) WITH CLUSTERING ORDER BY (added_year desc, video_id asc);
COPY videos_by_tag_year FROM 'labwork/exercise-4/videos_by_tag_year.csv' WITH HEADER=true;
SELECT * FROM videos_by_tag_year LIMIT 10;

DROP TABLE videos;
CREATE TABLE videos (
 video_id timeuuid,
 added_date timestamp,
 description text,
 tags set<text>,
 title text,
 user_id uuid,
 PRIMARY KEY((video_id)));
COPY videos FROM 'labwork/exercise-5/videos.csv' WITH HEADER=true;
CREATE TYPE video_encoding (
 bit_rates set<text>,
 encoding text,
 height int,
 width int);
ALTER TABLE videos ADD encoding frozen<video_encoding>;
COPY videos (video_id, encoding) FROM 'labwork/exercise-5/videos_encoding.csv' WITH HEADER=true;
SELECT * FROM videos LIMIT 10;

CREATE TABLE videos_count_by_tag (
 video_count counter,
 tag text,
 added_year int,
 PRIMARY KEY (tag, added_year));
SOURCE 'labwork/exercise-6/videos_count_by_tag.cql';
SELECT * FROM videos_count_by_tag;