# spark-shell transcript: word count over hdfs://localhost:9000/tmp/Input/kubernetes.txt
# Pipeline: textFile -> flatMap(split on " ") -> map(word -> (word, 1)) -> reduceByKey(_ + _)
scala> val text = sc.textFile("hdfs://localhost:9000/tmp/Input/kubernetes.txt")
text: org.apache.spark.rdd.RDD[String] = hdfs://localhost:9000/tmp/Input/kubernetes.txt MapPartitionsRDD[3] at textFile at :23
scala> text.collect;
res1: Array[String] = Array(https://www.youtube.com/watch?v=o6bxo0Oeg6o&t=130s, https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/, install-kubeadm/, "", Installing a container runtime, Install Docker Engine on Ubuntu, =============, 1.Set up Docker's apt repository., "", # Add Docker's official GPG key:, sudo apt-get update, sudo apt-get install ca-certificates curl gnupg, sudo install -m 0755 -d /etc/apt/keyrings, curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg, sudo chmod a+r /etc/apt/keyrings/docker.gpg, "", # Add the repository to Apt sources:, echo \, " "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ...
scala> val counts = text.flatMap(line => line.split(" "))
counts: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[4] at flatMap at :23
scala> counts.collect;
res2: Array[String] = Array(https://www.youtube.com/watch?v=o6bxo0Oeg6o&t=130s, https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/, install-kubeadm/, "", Installing, a, container, runtime, Install, Docker, Engine, on, Ubuntu, =============, 1.Set, up, Docker's, apt, repository., "", #, Add, Docker's, official, GPG, key:, sudo, apt-get, update, sudo, apt-get, install, ca-certificates, curl, gnupg, sudo, install, -m, 0755, -d, /etc/apt/keyrings, curl, -fsSL, https://download.docker.com/linux/ubuntu/gpg, |, sudo, gpg, --dearmor, -o, /etc/apt/keyrings/docker.gpg, sudo, chmod, a+r, /etc/apt/keyrings/docker.gpg, "", #, Add, the, repository, to, Apt, sources:, echo, \, "", "", "deb, [arch="$(dpkg, --print-architecture)", signed-by=/etc/apt/keyrings...
scala> val mapf = counts.map(word => (word,1))
mapf: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at map at :23
scala> mapf.collect
res3: Array[(String, Int)] = Array((https://www.youtube.com/watch?v=o6bxo0Oeg6o&t=130s,1), (https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/,1), (install-kubeadm/,1), ("",1), (Installing,1), (a,1), (container,1), (runtime,1), (Install,1), (Docker,1), (Engine,1), (on,1), (Ubuntu,1), (=============,1), (1.Set,1), (up,1), (Docker's,1), (apt,1), (repository.,1), ("",1), (#,1), (Add,1), (Docker's,1), (official,1), (GPG,1), (key:,1), (sudo,1), (apt-get,1), (update,1), (sudo,1), (apt-get,1), (install,1), (ca-certificates,1), (curl,1), (gnupg,1), (sudo,1), (install,1), (-m,1), (0755,1), (-d,1), (/etc/apt/keyrings,1), (curl,1), (-fsSL,1), (https://download.docker.com/linux/ubuntu/gpg,1), (|,1), (sudo,1), (gpg,1), (--dearmor,1), (-o,1), (/etc/apt/ke...
scala> val reducef = mapf.reduceByKey(_+_);
reducef: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[6] at reduceByKey at :23
scala> reducef.collect
res4: Array[(String, Int)] = Array((package,4), (index,1), (cluster.,1), (kube-scheduler-k8smaster-vm,2), ("$(.,1), (-e,1), (/',1), (/etc/kubernetes/admin.conf,1), (/etc/os-release,1), (This,1), (repository.,1), ([signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg],1), (RESTARTS,2), (kube-flannel,1), (kube-apiserver-k8smaster-vm,2), (daemon-reload,1), (export,2), (gpg,3), (already,1), (any,2), (go,1), (make,1), (network,1), (Download,2), (git,1), (control-plane,1), (4.,2), (packaging/systemd/*,1), (-o,3), (are,1), ("kubectl,1), (2.,2), (sha256:b058fc69cbec62d085bb38d84f0a89879cbe16068567f061a8fac84f87eab9aa,1), ([podnetwork].yaml",1), (https://download.docker.com/linux/ubuntu/gpg,1), (STATUS,2), (kubelet,3), (overwrites,1), (commands,1), (can,3), (tee,2), (...